Validation¶
Validation of algorithms and transformation results.
Chap 7. Validation (Validação)
- Section 7.3 Results (Resultados)
- Section 7.3.2 Semantic Annotation
- Classification of operative rules, facts, terms, and names
- Section 7.3.3 nlp2sbvr
Google colab¶
# Auto-reload edited local modules without restarting the kernel.
%load_ext autoreload
%autoreload 2

import sys

# Detect whether this notebook is running inside Google Colab.
IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive')
    # Fetch a fresh copy of the project repo and its local packages,
    # then switch to the Colab-specific configuration file.
    !rm -rf cfr2sbvr configuration checkpoint
    !git clone https://github.com/asantos2000/master-degree-santos-anderson.git cfr2sbvr
    %pip install -r cfr2sbvr/code/requirements.txt
    !cp -r cfr2sbvr/code/src/configuration .
    !cp -r cfr2sbvr/code/src/checkpoint .
    !cp -r cfr2sbvr/code/config.colab.yaml config.yaml
    DEFAULT_CONFIG_FILE="config.yaml"
else:
    # Local run: the config lives one directory above the notebook.
    DEFAULT_CONFIG_FILE="../config.yaml"
Imports¶
# Standard library imports
import json
import os
import time
from datetime import datetime
from typing import List
# Local application/library-specific imports
import logging_setup.main as logging_setup
import matplotlib.pyplot as plt
import missingno as mi
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import rules_taxonomy_provider.main as rules_taxonomy_provider
# Third-party imports
import scipy.stats as stats
import seaborn as sns
import statsmodels.api as sm
from scipy.spatial.distance import cosine
from scipy.stats import kendalltau, spearmanr
from openai import OpenAI
from pydantic import BaseModel, Field
# Local modules
import configuration.main as configuration
import checkpoint.main as checkpoint
from checkpoint.main import (
Document,
get_all_checkpoints,
get_elements_from_checkpoints,
restore_checkpoint,
save_checkpoint,
)
import llm_query.main as llm_query
from llm_query.main import query_instruct_llm
from rules_taxonomy_provider.main import RulesTemplateProvider
# When True, reload the project's local modules so edits to them are picked up
# in the running notebook session.
DEV_MODE = True

if DEV_MODE:
    # Development mode
    import importlib
    importlib.reload(configuration)
    importlib.reload(logging_setup)
    importlib.reload(checkpoint)
    importlib.reload(llm_query)
    importlib.reload(rules_taxonomy_provider)

# Ensure plots are displayed inline if using a Jupyter notebook
%matplotlib inline

from IPython.display import display
Settings¶
Configuration¶
# Load configuration
# Reads the YAML config chosen above (local path vs Colab copy).
config = configuration.load_config(DEFAULT_CONFIG_FILE)
Logging¶
# Set up the notebook logger; log directory and level come from the config.
logger = logging_setup.setting_logging(config["DEFAULT_LOG_DIR"], config["LOG_LEVEL"])
2024-12-14 13:12:23 - INFO - Logging is set up with daily rotation.
Checkpoints¶
Restore the checkpoint¶
# Restore the checkpoint
# To run after extraction
# Locate the newest "documents-*.json" file in the checkpoint directory.
last_checkpoint = configuration.get_last_filename(
    config["DEFAULT_CHECKPOINT_DIR"], "documents", "json"
)
logger.info(f"{last_checkpoint=}")
config["DEFAULT_CHECKPOINT_FILE"] = last_checkpoint
# Rebuild the DocumentManager from the saved checkpoint file.
manager = restore_checkpoint(filename=config["DEFAULT_CHECKPOINT_FILE"])
2024-12-14 13:12:23 - INFO - last_checkpoint='../data/checkpoints/documents-2024-12-08-10.json' 2024-12-14 13:12:23 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-10.json 2024-12-14 13:12:23 - INFO - Checkpoint restored from ../data/checkpoints/documents-2024-12-08-10.json.
General functions¶
def remove_section_symbol(input_string: str) -> str:
    """
    Strip every '§' character from the input and trim surrounding whitespace.

    Args:
        input_string (str): The string possibly containing '§' symbols.

    Returns:
        str: The input with all '§' removed and leading/trailing whitespace stripped.

    Raises:
        TypeError: If 'input_string' is not a string.
    """
    if isinstance(input_string, str):
        return input_string.replace("§", "").strip()
    raise TypeError("input_string must be a string")
def prompt_analysis(raw_data, output_dir):
    """
    Aggregate LLM usage/cost statistics from raw completion records, write them
    to "prompt-analysis.xlsx" in *output_dir*, and print the same summaries.

    Args:
        raw_data: records with fields (filename, doc_type, elapsed_time, usage,
            created, model); `usage` is a dict holding completion/prompt/total
            token counts, `created` a Unix timestamp in seconds.
        output_dir (str): Directory where the Excel workbook is written.

    NOTE(review): this function reads module-level names that are NOT defined
    in this file section — `reference_models`, `price_per_million_tokens`
    (dicts) and `file_info` (dict). Confirm they exist in the notebook
    before calling.
    """
    # Create a DataFrame from the raw data
    data = pd.DataFrame(
        raw_data,
        columns=["filename", "doc_type", "elapsed_time", "usage", "created", "model"],
    )
    # Transform 'created' to a human-readable datetime format
    data["created"] = pd.to_datetime(data["created"], unit="s")
    # Extract relevant information from the 'usage' dictionary
    data["completion_tokens"] = data["usage"].apply(lambda x: x["completion_tokens"])
    data["prompt_tokens"] = data["usage"].apply(lambda x: x["prompt_tokens"])
    data["total_tokens"] = data["usage"].apply(lambda x: x["total_tokens"])

    # Define a function to get reference model context length
    def get_reference_model_context_length(model):
        # NOTE(review): `reference_models` must be a module-level dict — TODO confirm.
        return reference_models.get(
            model, 128_000
        )  # Default to 128,000 if model is unknown

    # Define a function to get the price per million tokens
    def get_price_per_million_tokens(model):
        # NOTE(review): `price_per_million_tokens` must be a module-level dict;
        # a DataFrame column of the same name is created below — TODO confirm.
        return price_per_million_tokens.get(
            model, 2.50
        )  # Default to 2.50 if model is unknown

    # Add context length and price per million tokens columns
    data["reference_context_length"] = data["model"].apply(
        get_reference_model_context_length
    )
    data["price_per_million_tokens"] = data["model"].apply(get_price_per_million_tokens)

    # Overall Statistics
    total_tokens = data["total_tokens"].sum()
    num_samples = len(data)
    average_elapsed_time = data["elapsed_time"].mean()
    # Cost = tokens (in millions) * per-model price.
    estimated_cost = (
        data["total_tokens"] / 1_000_000 * data["price_per_million_tokens"]
    ).sum()
    average_percentage_context_length = (
        data["total_tokens"] / data["reference_context_length"]
    ).mean() * 100
    min_created = data["created"].min().strftime("%Y-%m-%d %H:%M:%S")
    max_created = data["created"].max().strftime("%Y-%m-%d %H:%M:%S")

    # Add filename column to each statistic for origin tracking
    # NOTE(review): `file_info` is neither a parameter nor defined in this
    # function — this raises NameError unless a global `file_info` exists.
    # Possibly data["filename"] was intended; verify against the caller.
    filename = file_info["filename"]

    # Data and time of the execution
    now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Create Overall Statistics DataFrame
    overall_stats_df = pd.DataFrame(
        [
            {
                "Total Tokens": total_tokens,
                "Number of Samples": num_samples,
                "Average Elapsed Time (s)": average_elapsed_time,
                "Estimated Cost (USD)": estimated_cost,
                "Average Percentage of Context Length (%)": average_percentage_context_length,
                "Min Created Timestamp": min_created,
                "Max Created Timestamp": max_created,
                "origin": filename,
                "run_at": now,
            }
        ]
    )

    # Statistics by Sample Type (doc_type)
    stats_by_doc_type = (
        data.groupby("doc_type")
        .agg(
            total_tokens=("total_tokens", "sum"),
            num_samples=("doc_type", "count"),
            average_elapsed_time=("elapsed_time", "mean"),
            average_tokens=("total_tokens", "mean"),
            # Group cost: group token sum (millions) * mean group price.
            estimated_cost=(
                "total_tokens",
                lambda x: (x.sum() / 1_000_000)
                * data.loc[x.index, "price_per_million_tokens"].mean(),
            ),
            average_percentage_context_length=(
                "total_tokens",
                lambda x: (
                    x.mean() / data.loc[x.index, "reference_context_length"].mean()
                )
                * 100,
            ),
        )
        .reset_index()
    )
    stats_by_doc_type["filename"] = filename
    stats_by_doc_type["run_at"] = now

    # Statistics by Model
    stats_by_model = (
        data.groupby("model")
        .agg(
            total_tokens=("total_tokens", "sum"),
            num_samples=("model", "count"),
            average_elapsed_time=("elapsed_time", "mean"),
            average_tokens=("total_tokens", "mean"),
            average_percentage_context_length=(
                "total_tokens",
                # x.name is the model key for the current group.
                lambda x: (x.mean() / get_reference_model_context_length(x.name)) * 100,
            ),
        )
        .reset_index()
    )
    stats_by_model["filename"] = filename
    stats_by_model["run_at"] = now

    # Add estimated cost and cost columns separately since they require different calculations
    def calculate_group_cost(model):
        # Cost for one model: (its total tokens / 1M) * per-million price.
        price = get_price_per_million_tokens(model)
        total_tokens = data[data["model"] == model]["total_tokens"].sum()
        return (total_tokens / 1_000_000) * price

    stats_by_model["estimated_cost"] = stats_by_model["model"].apply(
        calculate_group_cost
    )
    stats_by_model["cost"] = stats_by_model["estimated_cost"]

    # Calculate Tokens per Second
    # Ensure there are no division by zero issues by filtering out zero elapsed times
    data = data[data["elapsed_time"] > 0]
    # NOTE(review): assigning into a filtered slice can trigger pandas'
    # SettingWithCopyWarning; a .copy() on the line above would silence it.
    data["tokens_per_second"] = data["total_tokens"] / data["elapsed_time"]

    # Write the statistics to an Excel file
    file_name = os.path.join(output_dir, "prompt-analysis.xlsx")
    with pd.ExcelWriter(file_name, engine="openpyxl") as writer:
        # Replace the data on each sheet with the new data
        overall_stats_df.to_excel(writer, sheet_name="Overall Statistics", index=False)
        stats_by_doc_type.to_excel(
            writer, sheet_name="Statistics by Sample Type", index=False
        )
        stats_by_model.to_excel(writer, sheet_name="Statistics by Model", index=False)
        additional_stats_df = pd.DataFrame(
            [
                {
                    "Average Completion Tokens": data["completion_tokens"].mean(),
                    "Average Prompt Tokens": data["prompt_tokens"].mean(),
                    "Average Total Tokens per Sample": data["total_tokens"].mean(),
                    "Total Elapsed Time (s)": data["elapsed_time"].sum(),
                    "Average Tokens per Second": data["tokens_per_second"].mean(),
                    "origin": filename,
                    "run_at": now,
                }
            ]
        )
        additional_stats_df.to_excel(
            writer, sheet_name="Additional Statistics", index=False
        )
        data.to_excel(writer, sheet_name="Raw Data", index=False)
        # Explanation Page
        explanation_data = {
            "Sheet Name": [
                "Overall Statistics",
                "Statistics by Sample Type",
                "Statistics by Model",
                "Additional Statistics",
                "Raw Data",
            ],
            "Description": [
                "Summary statistics of the entire dataset, including total tokens, number of samples, average elapsed time, and estimated cost.",
                "Statistics broken down by sample type (doc_type), including the total number of tokens and cost estimates for each type.",
                "Statistics grouped by the model used, showing token utilization, cost, and elapsed time for each model.",
                "Additional aggregated metrics such as average completion tokens, prompt tokens, total tokens per sample, and processing time.",
                "The raw data used for generating all the statistics, including individual completions and their details.",
            ],
            "Columns Explained": [
                "Total Tokens: Total number of tokens processed. Number of Samples: Total number of samples. Average Elapsed Time (s): Average time taken for processing. Estimated Cost (USD): Estimated cost for token usage. Average Percentage of Context Length (%): Average percentage of used context length. Min and Max Created Timestamp: The time range of the data collected. Origin: Source filename.",
                "doc_type: Type of document. total_tokens: Sum of tokens per document type. num_samples: Number of samples of this type. average_elapsed_time: Average time taken per document type. average_tokens: Average tokens per sample. estimated_cost: Estimated cost for tokens of this type. average_percentage_context_length: Average percentage of context length used. filename: Source filename.",
                "model: Model name. total_tokens: Total number of tokens used by the model. num_samples: Number of samples processed by the model. average_elapsed_time: Average processing time for the model. average_tokens: Average number of tokens per sample. average_percentage_context_length: Average context length percentage used. filename: Source filename. estimated_cost/cost: Cost for the tokens used by the model.",
                "Average Completion Tokens: Average number of completion tokens per sample. Average Prompt Tokens: Average number of prompt tokens per sample. Average Total Tokens per Sample: Average number of total tokens per sample. Total Elapsed Time (s): Total processing time for all samples. Average Tokens per Second: Average number of tokens processed per second. origin: Source filename.",
                "filename: Source filename. doc_type: Type of document. elapsed_time: Time taken for each document. usage: Token usage details (completion and prompt). created: Timestamp of creation. model: Model used.",
            ],
        }
        explanation_df = pd.DataFrame(explanation_data)
        explanation_df.to_excel(writer, sheet_name="Explanation", index=False)

    # Display Overall Statistics
    overall_stats_df_display = pd.DataFrame(
        [
            {
                "Total Tokens": total_tokens,
                "Number of Samples": num_samples,
                "Average Elapsed Time (s)": average_elapsed_time,
                "Estimated Cost (USD)": estimated_cost,
                "Average Percentage of Context Length (%)": average_percentage_context_length,
                "Min Created Timestamp": min_created,
                "Max Created Timestamp": max_created,
                "origin": filename,
                "run_at": now,
            }
        ]
    )
    print("\nOverall Statistics:")
    print(overall_stats_df_display.to_string(index=False))

    # Display Statistics by Sample Type
    print("\nStatistics by Sample Type (doc_type):")
    print(stats_by_doc_type.to_string(index=False))

    # Display Statistics by Model
    print("\nStatistics by Model:")
    print(stats_by_model.to_string(index=False))

    # Additional Statistics
    additional_stats_df_display = pd.DataFrame(
        [
            {
                "Average Completion Tokens": data["completion_tokens"].mean(),
                "Average Prompt Tokens": data["prompt_tokens"].mean(),
                "Average Total Tokens per Sample": data["total_tokens"].mean(),
                "Total Elapsed Time (s)": data["elapsed_time"].sum(),
                "Average Tokens per Second": data["tokens_per_second"].mean(),
                "origin": filename,
                "run_at": now,
            }
        ]
    )
    print("\nAdditional Statistics:")
    print(additional_stats_df_display.to_string(index=False))
# Add similarity_classification based on similarity_score
def classify_similarity(score):
    """Translate a numeric similarity score into a categorical label.

    1.0 -> "identical"; [0.9, 1.0) -> "close-match"; below 0.9 -> "not-sure".
    """
    if score == 1.0:
        return "identical"
    return "close-match" if score >= 0.9 else "not-sure"
# Modify the highlight_similarity function to use three colors
def highlight_similarity(val):
    """Return a CSS background color for a similarity label (DataFrame styling).

    "identical" -> green, "close-match" -> yellow, anything else -> red.
    """
    palette = {"identical": "green", "close-match": "yellow"}
    return f"background-color: {palette.get(val, 'red')}"
def create_df_elements_results(similarity_elements_results):
    """Build the element-results DataFrame with similarity and match labels.

    Adds a 'similarity_classification' column (via classify_similarity) plus,
    for each of classification/source/id, a boolean '<name>_match' column and
    a '<name>_match_label' column mapped to "match"/"mismatch".
    """
    df_results = pd.DataFrame(similarity_elements_results)
    df_results["similarity_classification"] = df_results["similarity_score"].apply(
        classify_similarity
    )
    label_map = {True: "match", False: "mismatch"}
    for field in ("classification", "source", "id"):
        match_col = f"{field}_match"
        df_results[match_col] = (
            df_results[f"{field}_pred"] == df_results[f"{field}_true"]
        )
        df_results[f"{match_col}_label"] = df_results[match_col].map(label_map)
    return df_results
class JudgeStatement(BaseModel):
    """Structured-output schema: one LLM-judge evaluation of a transformed statement."""

    doc_id: str = Field(..., description="Document ID associated with the statement.")
    statement_id: str = Field(
        ...,
        description="A provided string that identifies the statement. e.g., '1', 'Person'.",
    )
    statement: str = Field(..., description="The statement to be transformed.")
    sources: List[str] = Field(..., description="Sources of the statement.")
    # Carried through from the input unchanged (see description).
    semscore: float = Field(..., description="just a copy from input semscore.")
    similarity_score: float = Field(
        ...,
        description="Similarity score between the original and transformed sentences.",
    )
    similarity_score_confidence: float = Field(
        ..., description="Confidence score for the similarity score."
    )
    transformation_accuracy: float = Field(
        ..., description="Accuracy score for the transformation."
    )
    grammar_syntax_accuracy: float = Field(
        ..., description="Accuracy score for the grammar and syntax."
    )
    findings: List[str] = Field(..., description="List of findings.")
class JudgeStatements(BaseModel):
    """Structured-output schema: container for a batch of JudgeStatement results."""

    JudgeStatements: List[JudgeStatement] = Field(
        ..., description="List of judge statements."
    )
def get_prompts_for_judge(rules, data_dir):
    """
    Build paired (system, user) judge prompts for a list of rule records.

    Args:
        rules: list of dicts; each must provide "element_name" and
            "templates_ids".
        data_dir: base directory handed to RulesTemplateProvider.

    Returns:
        tuple: (system_prompts, user_prompts, element_name), where
        element_name is the value taken from the LAST rule processed.
    """
    rule_template_provider = RulesTemplateProvider(data_dir)
    system_prompts = []
    user_prompts = []
    for rule in rules:
        element_name = rule.get("element_name")
        # NOTE(review): this compares against a *list* literal, so the branch
        # runs only when element_name is exactly ["Term", "Name"]; if
        # element_name is a plain string it is always False. Confirm whether
        # `element_name in ("Term", "Name")` was intended.
        if element_name == ["Term", "Name"]:
            statement_key = "definition"
            statement_id_key = "signifier"
        else:
            statement_key = "statement"
            statement_id_key = "statement_id"
        # NOTE(review): statement_key / statement_id_key are assigned but
        # never used in this function — dead code or missing wiring?
        user_prompt = get_user_prompt_judge_sentence_similarity(element_name, rule)
        user_prompts.append(user_prompt)
        rule_templates_subtemplates = rule_template_provider.get_rules_template(
            rule["templates_ids"]
        )
        system_prompt = get_system_prompt_judge_sentence_similarity(
            rule_templates_subtemplates
        )
        system_prompts.append(system_prompt)
        logger.debug(system_prompt)
        logger.debug(user_prompt)
    logger.info(f"System prompts for {element_name}s: {len(system_prompts)}")
    logger.info(f"User prompts for {element_name}s: {len(user_prompts)}")
    return system_prompts, user_prompts, element_name
def evaluate_statement(element_name, user_prompts, system_prompts, manager):
    """
    Run the LLM judge over paired (system, user) prompts and checkpoint results.

    Args:
        element_name (str): Element type being evaluated (e.g. "Operative Rule");
            used in log messages and in the checkpoint document id.
        user_prompts: list of user prompt strings (parallel to system_prompts).
        system_prompts: list of system prompt strings.
        manager: checkpoint DocumentManager that receives the result Document.

    Returns:
        list: all JudgeStatement entries accumulated across every response.

    Reads LLM settings (model, temperature, max tokens) from the module-level
    `config` dict; sleeps 2s between calls to avoid rate limits.
    """
    # Initialize an empty list to accumulate all responses
    all_responses = []
    elapse_times = []
    completions = []
    # Loop through each pair of user and system prompts with a counter
    for index, (user_prompt, system_prompt) in enumerate(
        zip(user_prompts, system_prompts), start=1
    ):
        logger.info(f"Processing evaluation prompt {index} for {element_name}.")
        logger.debug(system_prompt)
        logger.debug(user_prompt)
        # Query the language model
        response, completion, elapse_time = query_instruct_llm(
            system_prompt=system_prompt,
            user_prompt=user_prompt,
            document_model=JudgeStatements,
            llm_model=config["LLM"]["MODEL"],
            temperature=config["LLM"]["TEMPERATURE"],
            max_tokens=config["LLM"]["MAX_TOKENS"],
        )
        logger.debug(f"{response}")
        # Accumulate the responses in the list
        all_responses.extend(response.JudgeStatements)
        elapse_times.append(elapse_time)
        completions.append(completion.dict())
        logger.info(f"Finished processing evaluation {index}.")
        logger.info("Waiting 2s before processing the next prompt to avoid rate limits")
        time.sleep(2)
    # After the loop, create a single Document with all the accumulated responses
    doc = Document(
        id=f"validation_judge_{element_name.replace(' ', '_')}s",
        type="llm_validation",
        content=all_responses,
        elapsed_times=elapse_times,
        completions=completions,
    )
    manager.add_document(doc)
    logger.info(f"{element_name}s: {len(all_responses)}")
    return all_responses
def get_embedding(text, model="text-embedding-3-large"):
    """Return the OpenAI embedding vector for *text* (newlines collapsed to spaces)."""
    cleaned = text.replace("\n", " ")
    response = OpenAI().embeddings.create(input=[cleaned], model=model)
    return response.data[0].embedding
def cosine_similarity(embedding1, embedding2):
    """Compute the cosine similarity between two embedding vectors."""
    vec_a = np.array(embedding1)
    vec_b = np.array(embedding2)
    # dot(a, b) / (|a| * |b|)
    return np.dot(vec_a, vec_b) / (np.linalg.norm(vec_a) * np.linalg.norm(vec_b))
def compare_sentences(sentence1, sentence2):
    """Return the embedding-based cosine similarity of two sentences."""
    # Embed both sentences.
    emb_a = get_embedding(sentence1)
    emb_b = get_embedding(sentence2)
    # scipy's `cosine` is the cosine *distance*; similarity = 1 - distance.
    return 1 - cosine(emb_a, emb_b)
# Intraclass Correlation Coefficient (ICC) via a statsmodels mixed model
def calculate_icc(data):
    """
    Compute the ICC between the 'semscore' and 'similarity_score' raters
    using a random-intercept mixed model.

    data: pandas DataFrame with 'semscore' and 'similarity_score' columns.
    """
    scores = data[["semscore", "similarity_score"]].dropna().reset_index(drop=True)
    scores["subject"] = scores.index
    # Long format: one row per (subject, rater) pair.
    long_form = scores.melt(id_vars=["subject"], var_name="rater", value_name="score")
    fitted = sm.MixedLM.from_formula(
        "score ~ 1", groups="subject", re_formula="1", data=long_form
    ).fit()
    between_var = fitted.cov_re.iloc[0, 0]
    within_var = fitted.scale
    # ICC = between-subject variance / total variance.
    return between_var / (between_var + within_var)
# Function to calculate descriptive statistics
def calculate_descriptive_stats(series):
    """
    Return pandas describe() output for *series*, augmented with
    'range' (max - min) and 'IQR' (75% - 25%).
    """
    # Named `summary` rather than `stats` to avoid shadowing the
    # module-level `scipy.stats as stats` import.
    summary = series.describe()
    summary["range"] = summary["max"] - summary["min"]
    summary["IQR"] = summary["75%"] - summary["25%"]
    return summary
# Function to plot Bland-Altman plot
def plot_bland_altman(df, score1, score2, title, output_dir, filename):
    """
    Draw, save, and display a Bland-Altman agreement plot for two score columns.

    Args:
        df (pd.DataFrame): Data containing both score columns.
        score1, score2 (str): Column names to compare.
        title (str): Plot title.
        output_dir (str): Directory for the saved image.
        filename (str): Image file name.

    Returns:
        str: Full path of the saved PNG.
    """
    mean_scores = (df[score1] + df[score2]) / 2
    diff_scores = df[score1] - df[score2]
    diff_mean = diff_scores.mean()
    diff_std = diff_scores.std()
    # 95% limits of agreement.
    loa_upper = diff_mean + 1.96 * diff_std
    loa_lower = diff_mean - 1.96 * diff_std
    plt.figure(figsize=(8, 6))
    plt.scatter(mean_scores, diff_scores, alpha=0.7)
    plt.axhline(diff_mean, color="red", linestyle="--")
    plt.axhline(loa_upper, color="grey", linestyle="--")
    plt.axhline(loa_lower, color="grey", linestyle="--")
    plt.xlabel("Mean of Scores")
    plt.ylabel("Difference of Scores")
    plt.title(title)
    plt.grid(True)
    # Save BEFORE plt.show(): on interactive/inline backends show() can
    # finalize/clear the figure, leaving a blank image if saved afterwards.
    plot_path = os.path.join(output_dir, filename)
    plt.savefig(plot_path)
    plt.show()
    plt.close()
    return plot_path
# Function to plot scatter plot
def plot_scatter(df, x_col, y_col, title, xlabel, ylabel, output_dir, filename):
    """
    Draw, save, and display a scatter plot of two DataFrame columns.

    Args:
        df (pd.DataFrame): Data source.
        x_col, y_col (str): Column names for the x and y axes.
        title, xlabel, ylabel (str): Plot labels.
        output_dir (str): Directory for the saved image.
        filename (str): Image file name.

    Returns:
        str: Full path of the saved PNG.
    """
    plt.figure(figsize=(8, 6))
    plt.scatter(df[x_col], df[y_col], alpha=0.7)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.grid(True)
    # Save BEFORE plt.show() so the figure is captured before any backend
    # clears it (saving after show() can yield a blank image).
    plot_path = os.path.join(output_dir, filename)
    plt.savefig(plot_path)
    plt.show()
    plt.close()
    return plot_path
# Function to plot histogram
def plot_histogram(series, title, xlabel, output_dir, filename):
    """
    Draw, save, and display a 20-bin histogram of *series*.

    Returns:
        str: Full path of the saved PNG.
    """
    plt.figure(figsize=(8, 6))
    series.hist(bins=20)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel("Frequency")
    plt.grid(True)
    # Save BEFORE plt.show() so the figure is captured before any backend
    # clears it (saving after show() can yield a blank image).
    plot_path = os.path.join(output_dir, filename)
    plt.savefig(plot_path)
    plt.show()
    plt.close()
    return plot_path
# Function to plot box plot
def plot_boxplot(series, title, ylabel, output_dir, filename):
    """
    Draw, save, and display a box plot of *series*.

    Returns:
        str: Full path of the saved PNG.
    """
    plt.figure(figsize=(8, 6))
    series.plot.box()
    plt.title(title)
    plt.ylabel(ylabel)
    plt.grid(True)
    # Save BEFORE plt.show() so the figure is captured before any backend
    # clears it (saving after show() can yield a blank image).
    plot_path = os.path.join(output_dir, filename)
    plt.savefig(plot_path)
    plt.show()
    plt.close()
    return plot_path
# Function to plot density plot
def plot_density(series, title, xlabel, output_dir, filename):
    """
    Draw, save, and display a kernel-density (KDE) plot of *series*.

    Returns:
        str: Full path of the saved PNG.
    """
    plt.figure(figsize=(8, 6))
    series.plot(kind="kde")
    plt.title(title)
    plt.xlabel(xlabel)
    plt.grid(True)
    # Save BEFORE plt.show() so the figure is captured before any backend
    # clears it (saving after show() can yield a blank image).
    plot_path = os.path.join(output_dir, filename)
    plt.savefig(plot_path)
    plt.show()
    plt.close()
    return plot_path
# Function to count scores above a threshold
def count_scores_above_threshold(series, threshold):
    """Return how many values in *series* are strictly greater than *threshold*."""
    return (series > threshold).sum()
# Function to plot heatmap
def plot_heatmap(df, title, output_dir, filename):
    """
    Draw, save, and display a correlation-matrix heatmap of *df*.

    Returns:
        str: Full path of the saved PNG.
    """
    plt.figure(figsize=(10, 8))
    sns.heatmap(df.corr(), annot=True, cmap="coolwarm")
    plt.title(title)
    # Save BEFORE plt.show() so the figure is captured before any backend
    # clears it; keep the tight bounding box so annotations are not cut off.
    plot_path = os.path.join(output_dir, filename)
    plt.savefig(plot_path, bbox_inches="tight")
    plt.show()
    plt.close()
    return plot_path
# Function to plot Q-Q plot
def plot_qqplot(series, title, output_dir, filename):
    """
    Draw, save, and display a normal Q-Q plot of *series*.

    Returns:
        str: Full path of the saved PNG.
    """
    plt.figure(figsize=(8, 6))
    # Generate the Q-Q plot against the normal distribution.
    stats.probplot(series, dist="norm", plot=plt)
    plt.title(title)
    plt.grid(True)
    # Save BEFORE plt.show() so the figure is captured before any backend
    # clears it (saving after show() can yield a blank image).
    plot_path = os.path.join(output_dir, filename)
    plt.savefig(plot_path)
    plt.show()
    plt.close()
    return plot_path
# Main function to process all elements
def process_all_elements(element_data, output_dir):
logger.info(f"Processing All Elements\n{'-'*40}")
# Ensure the output directory exists
os.makedirs(output_dir, exist_ok=True)
# Define the path for the Excel file
excel_file_path = os.path.join(output_dir, "combined_analysis_results.xlsx")
# Create an Excel writer using XlsxWriter as the engine
writer = pd.ExcelWriter(excel_file_path, engine="xlsxwriter")
workbook = writer.book
# List to store DataFrames for combining data later
combined_df_list = []
# List to keep track of image filenames for cleanup
image_files = []
# Loop through each element type in element_data
for element_name, content in element_data.items():
element_type = element_name # e.g., 'Operative_Rules'
logger.info(f"\nProcessing Element Type: {element_type}")
df = pd.DataFrame(content)
# Ensure numeric columns are of float type
numeric_cols = [
"semscore",
"similarity_score",
"similarity_score_confidence",
"transformation_accuracy",
"grammar_syntax_accuracy",
]
df[numeric_cols] = df[numeric_cols].astype(float)
# Add a column for element type
df["element_type"] = element_type
# Append to combined data list
combined_df_list.append(df)
# Perform analysis on this element type
sheet_name = element_type[:31] # Sheet names have a max length of 31 characters
worksheet = workbook.add_worksheet(sheet_name)
writer.sheets[sheet_name] = worksheet
# Reset row counter for each sheet
row = 0
# Descriptive Statistics
semscore_stats = calculate_descriptive_stats(df["semscore"])
similarity_score_stats = calculate_descriptive_stats(df["similarity_score"])
# Correlations
pearson_corr = df["semscore"].corr(df["similarity_score"])
spearman_corr = df["semscore"].corr(df["similarity_score"], method="spearman")
# Display statistics in the notebook
print("Semscore Statistics:")
display(semscore_stats.to_frame())
print("\nSimilarity Score Statistics:")
display(similarity_score_stats.to_frame())
print(
f"\nPearson Correlation between semscore and similarity_score: {pearson_corr:.4f}"
)
print(
f"Spearman Correlation between semscore and similarity_score: {spearman_corr:.4f}"
)
# Write Semscore Statistics to Excel
semscore_stats_df = semscore_stats.to_frame(name="Semscore Statistics")
semscore_stats_df.to_excel(
writer, sheet_name=sheet_name, startrow=row, startcol=0
)
row += len(semscore_stats_df) + 3 # Increment row for next section
# Write Similarity Score Statistics to Excel
similarity_score_stats_df = similarity_score_stats.to_frame(
name="Similarity Score Statistics"
)
similarity_score_stats_df.to_excel(
writer, sheet_name=sheet_name, startrow=row, startcol=0
)
row += len(similarity_score_stats_df) + 3
# Write Correlations to Excel
worksheet.write(
row, 0, "Pearson Correlation between semscore and similarity_score"
)
worksheet.write(row, 1, pearson_corr)
row += 1
worksheet.write(
row, 0, "Spearman Correlation between semscore and similarity_score"
)
worksheet.write(row, 1, spearman_corr)
row += 3
# Intraclass Correlation Coefficient (ICC)
icc_value = calculate_icc(df[["semscore", "similarity_score"]])
print(
f"\nIntraclass Correlation Coefficient (ICC) between semscore and similarity_score: {icc_value:.4f}"
)
worksheet.write(row, 0, "Intraclass Correlation Coefficient (ICC)")
worksheet.write(row, 1, icc_value)
row += 3
# Q-Q Plot for similarity_score
plot_filename = f"qqplot_similarity_score_{element_type}.png"
plot_path = plot_qqplot(
df["similarity_score"],
f"Q-Q Plot of similarity_score - {element_type}",
output_dir,
plot_filename,
)
# Bland-Altman Plot
plot_filename = f"bland_altman_{element_type}.png"
plot_path = plot_bland_altman(
df,
"semscore",
"similarity_score",
f"Bland-Altman Plot - {element_type}",
output_dir,
plot_filename,
)
worksheet.insert_image(row, 0, plot_path)
row += 20
image_files.append(plot_path)
# Scatter Plot
plot_filename = f"scatter_semscore_similarity_{element_type}.png"
plot_path = plot_scatter(
df,
"semscore",
"similarity_score",
f"Semscore vs Similarity Score - {element_type}",
"Semscore",
"Similarity Score",
output_dir,
plot_filename,
)
worksheet.insert_image(row, 0, plot_path)
row += 20
image_files.append(plot_path)
# Histograms for semscore
plot_filename = f"histogram_semscore_{element_type}.png"
plot_path = plot_histogram(
df["semscore"],
f"Histogram of Semscore - {element_type}",
"Semscore",
output_dir,
plot_filename,
)
worksheet.insert_image(row, 0, plot_path)
row += 20
image_files.append(plot_path)
# Box Plot for semscore
plot_filename = f"boxplot_semscore_{element_type}.png"
plot_path = plot_boxplot(
df["semscore"],
f"Box Plot of Semscore - {element_type}",
"Semscore",
output_dir,
plot_filename,
)
worksheet.insert_image(row, 0, plot_path)
row += 20
image_files.append(plot_path)
# Box Plot for similarity_score
plot_filename = f"boxplot_similarity_score_{element_type}.png"
plot_path = plot_boxplot(
df["similarity_score"],
f"Box Plot of similarity_score - {element_type}",
"similarity_score",
output_dir,
plot_filename,
)
worksheet.insert_image(row, 0, plot_path)
row += 20
image_files.append(plot_path)
# Density Plot for semscore
plot_filename = f"density_semscore_{element_type}.png"
plot_path = plot_density(
df["semscore"],
f"Density Plot of Semscore - {element_type}",
"Semscore",
output_dir,
plot_filename,
)
worksheet.insert_image(row, 0, plot_path)
row += 20
image_files.append(plot_path)
# Density Plot for similarity_score
plot_filename = f"density_similarity_score_{element_type}.png"
plot_path = plot_density(
df["similarity_score"],
f"Density Plot of similarity_score - {element_type}",
"similarity_score",
output_dir,
plot_filename,
)
worksheet.insert_image(row, 0, plot_path)
row += 20
image_files.append(plot_path)
# Counts of Scores Above Threshold
threshold = 0.8
count_above_threshold = count_scores_above_threshold(
df["transformation_accuracy"], threshold
)
print(
f"\nCount of Transformation Accuracy scores above {threshold}: {count_above_threshold}"
)
worksheet.write(row, 0, f"Count of Transformation Accuracy > {threshold}")
worksheet.write(row, 1, count_above_threshold)
row += 3
# Heatmap of Correlation Matrix
plot_filename = f"heatmap_{element_type}.png"
plot_path = plot_heatmap(
df[numeric_cols],
f"Correlation Matrix Heatmap - {element_type}",
output_dir,
plot_filename,
)
worksheet.insert_image(row, 0, plot_path)
row += 25
image_files.append(plot_path)
# Save the DataFrame with the original data to a separate sheet
data_sheet_name = f"{sheet_name}_Data"[:31]
df.to_excel(writer, sheet_name=data_sheet_name, index=False)
# Combine all DataFrames
combined_df = pd.concat(combined_df_list, ignore_index=True)
# Perform combined analysis
print("\nProcessing Combined Data")
sheet_name = "Combined_Analysis"
worksheet = workbook.add_worksheet(sheet_name)
writer.sheets[sheet_name] = worksheet
row = 0
# Ensure numeric columns are of float type
combined_df[numeric_cols] = combined_df[numeric_cols].astype(float)
# Descriptive Statistics for combined semscore
semscore_stats = calculate_descriptive_stats(combined_df["semscore"])
similarity_score_stats = calculate_descriptive_stats(
combined_df["similarity_score"]
)
# Correlations
pearson_corr = combined_df["semscore"].corr(combined_df["similarity_score"])
spearman_corr = combined_df["semscore"].corr(
combined_df["similarity_score"], method="spearman"
)
# Display statistics in the notebook
print("Combined Semscore Statistics:")
display(semscore_stats.to_frame())
print("\nCombined Similarity Score Statistics:")
display(similarity_score_stats.to_frame())
print(
f"\nCombined Pearson Correlation between semscore and similarity_score: {pearson_corr:.4f}"
)
print(
f"Combined Spearman Correlation between semscore and similarity_score: {spearman_corr:.4f}"
)
# Write Semscore Statistics to Excel
semscore_stats_df = semscore_stats.to_frame(name="Combined Semscore Statistics")
semscore_stats_df.to_excel(writer, sheet_name=sheet_name, startrow=row, startcol=0)
row += len(semscore_stats_df) + 3
# Write Similarity Score Statistics to Excel
similarity_score_stats_df = similarity_score_stats.to_frame(
name="Combined Similarity Score Statistics"
)
similarity_score_stats_df.to_excel(
writer, sheet_name=sheet_name, startrow=row, startcol=0
)
row += len(similarity_score_stats_df) + 3
# Write Correlations to Excel
worksheet.write(
row, 0, "Combined Pearson Correlation between semscore and similarity_score"
)
worksheet.write(row, 1, pearson_corr)
row += 1
worksheet.write(
row, 0, "Combined Spearman Correlation between semscore and similarity_score"
)
worksheet.write(row, 1, spearman_corr)
row += 3
# Intraclass Correlation Coefficient (ICC) for Combined Data
icc_value = calculate_icc(combined_df[["semscore", "similarity_score"]])
print(
f"\nCombined Intraclass Correlation Coefficient (ICC) between semscore and similarity_score: {icc_value:.4f}"
)
worksheet.write(row, 0, "Combined Intraclass Correlation Coefficient (ICC)")
worksheet.write(row, 1, icc_value)
row += 3
# Bland-Altman Plot for Combined Data
plot_filename = "combined_bland_altman.png"
plot_path = plot_bland_altman(
combined_df,
"semscore",
"similarity_score",
"Bland-Altman Plot - Combined Data",
output_dir,
plot_filename,
)
worksheet.insert_image(row, 0, plot_path)
row += 20
image_files.append(plot_path)
# Scatter Plot for Combined Data
plot_filename = "scatter_semscore_similarity_combined.png"
plot_path = plot_scatter(
combined_df,
"semscore",
"similarity_score",
"Semscore vs Similarity Score - Combined Data",
"Semscore",
"Similarity Score",
output_dir,
plot_filename,
)
worksheet.insert_image(row, 0, plot_path)
row += 20
image_files.append(plot_path)
# Histograms for combined semscore
plot_filename = "histogram_semscore_combined.png"
plot_path = plot_histogram(
combined_df["semscore"],
"Histogram of Semscore - Combined Data",
"Semscore",
output_dir,
plot_filename,
)
worksheet.insert_image(row, 0, plot_path)
row += 20
image_files.append(plot_path)
# Box Plot for combined semscore
plot_filename = "boxplot_semscore_combined.png"
plot_path = plot_boxplot(
combined_df["semscore"],
"Box Plot of Semscore - Combined Data",
"Semscore",
output_dir,
plot_filename,
)
worksheet.insert_image(row, 0, plot_path)
row += 20
image_files.append(plot_path)
# Density Plot for combined semscore
plot_filename = "density_semscore_combined.png"
plot_path = plot_density(
combined_df["semscore"],
"Density Plot of Semscore - Combined Data",
"Semscore",
output_dir,
plot_filename,
)
worksheet.insert_image(row, 0, plot_path)
row += 20
image_files.append(plot_path)
# Histograms for combined similarity_score
plot_filename = "histogram_similarity_score_combined.png"
plot_path = plot_histogram(
combined_df["similarity_score"],
"Histogram of similarity_score - Combined Data",
"similarity_score",
output_dir,
plot_filename,
)
worksheet.insert_image(row, 0, plot_path)
row += 20
image_files.append(plot_path)
# Box Plot for combined similarity_score
plot_filename = "boxplot_similarity_score_combined.png"
plot_path = plot_boxplot(
combined_df["similarity_score"],
"Box Plot of similarity_score - Combined Data",
"similarity_score",
output_dir,
plot_filename,
)
worksheet.insert_image(row, 0, plot_path)
row += 20
image_files.append(plot_path)
# Density Plot for combined similarity_score
plot_filename = "density_similarity_score_combined.png"
plot_path = plot_density(
combined_df["similarity_score"],
"Density Plot of similarity_score - Combined Data",
"similarity_score",
output_dir,
plot_filename,
)
worksheet.insert_image(row, 0, plot_path)
row += 20
image_files.append(plot_path)
# Counts of Scores Above Threshold in Combined Data
threshold = 0.8
count_above_threshold = count_scores_above_threshold(
combined_df["transformation_accuracy"], threshold
)
print(
f"\nCombined Count of Transformation Accuracy scores above {threshold}: {count_above_threshold}"
)
worksheet.write(row, 0, f"Combined Count of Transformation Accuracy > {threshold}")
worksheet.write(row, 1, count_above_threshold)
row += 3
# Heatmap of Correlation Matrix for Combined Data
plot_filename = "heatmap_combined.png"
plot_path = plot_heatmap(
combined_df[numeric_cols],
"Correlation Matrix Heatmap - Combined Data",
output_dir,
plot_filename,
)
worksheet.insert_image(row, 0, plot_path)
row += 25
image_files.append(plot_path)
# Save the combined DataFrame to a separate sheet
combined_df.to_excel(writer, sheet_name="Combined_Data", index=False)
# Close the writer and save the Excel file
writer.close()
print(f"Analysis saved to '{excel_file_path}'")
# Clean up the plot images after saving the workbook
for image_file in image_files:
if os.path.exists(image_file):
os.remove(image_file)
return combined_df
Datasets¶
From section 7.2.4 Datasets
The dataset of the previous algorithm was adjusted with the gold standard dataset. The goal is to reduce the accumulation of errors from one step to the next.
The adjusted data:
- § 275.0-2_P1, § 275.0-2_P2
- § 275.0-5_P1, § 275.0-5_P2
- § 275.0-7_P1, § 275.0-7_P2
Ground-truth tables¶
There are no ground-truth tables for evaluating the transformation; the evaluation relies on the SEMSCORE algorithm and the "LLM as a Judge" approach.
Predicted values¶
Get predicted elements from all runs
# Restore the predicted elements from every checkpoint run, merged across
# runs and filtered to non-null entries.
(
    pred_operative_rules_classify,
    pred_facts_classify,
    pred_terms_classify,
    pred_names_classify,
    pred_files_classify,
) = get_elements_from_checkpoints(
    config["DEFAULT_CHECKPOINT_DIR"], merge=True, filter="non_null"
)
2024-12-14 13:12:24 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-1.json 2024-12-14 13:12:24 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-10.json 2024-12-14 13:12:24 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-2.json 2024-12-14 13:12:24 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-3.json 2024-12-14 13:12:24 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-4.json 2024-12-14 13:12:24 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-5.json
2024-12-14 13:12:24 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-6.json 2024-12-14 13:12:24 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-7.json 2024-12-14 13:12:24 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-8.json 2024-12-14 13:12:24 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-9.json 2024-12-14 13:12:24 - INFO - Rules to evaluate: 60 2024-12-14 13:12:24 - INFO - Facts to evaluate: 160 2024-12-14 13:12:24 - INFO - Terms to evaluate: 280 2024-12-14 13:12:24 - INFO - Names to evaluate: 50
[{'filename': 'documents-2024-12-08-1.json', 'date': '2024-12-08', 'number': 1}, {'filename': 'documents-2024-12-08-10.json', 'date': '2024-12-08', 'number': 10}, {'filename': 'documents-2024-12-08-2.json', 'date': '2024-12-08', 'number': 2}, {'filename': 'documents-2024-12-08-3.json', 'date': '2024-12-08', 'number': 3}, {'filename': 'documents-2024-12-08-4.json', 'date': '2024-12-08', 'number': 4}, {'filename': 'documents-2024-12-08-5.json', 'date': '2024-12-08', 'number': 5}, {'filename': 'documents-2024-12-08-6.json', 'date': '2024-12-08', 'number': 6}, {'filename': 'documents-2024-12-08-7.json', 'date': '2024-12-08', 'number': 7}, {'filename': 'documents-2024-12-08-8.json', 'date': '2024-12-08', 'number': 8}, {'filename': 'documents-2024-12-08-9.json', 'date': '2024-12-08', 'number': 9}]
Set dataset to evaluation and check empty transformed elements
# Pair each predicted-element list with a short label used in reports below.
data = (
    pred_facts_classify,
    pred_terms_classify,
    pred_names_classify,
    pred_operative_rules_classify,
)
data_names = ("pred_facts", "pred_terms", "pred_names", "pred_operative_rules")

# Report how many elements in each list are missing a 'transformed' sentence.
for elements, label in zip(data, data_names):
    missing = sum(1 for item in elements if not item.get("transformed"))
    logger.info(f"Empty transformed {label}: {missing}/{len(elements)}")
2024-12-14 13:12:24 - INFO - Empty transformed pred_facts: 0/160 2024-12-14 13:12:24 - INFO - Empty transformed pred_terms: 0/280 2024-12-14 13:12:24 - INFO - Empty transformed pred_names: 0/50 2024-12-14 13:12:24 - INFO - Empty transformed pred_operative_rules: 0/60 2024-12-14 13:12:24 - INFO - Empty transformed pred_terms: 0/280 2024-12-14 13:12:24 - INFO - Empty transformed pred_names: 0/50 2024-12-14 13:12:24 - INFO - Empty transformed pred_operative_rules: 0/60
# Draw one missing-value matrix per predicted-element list.
for elements, label in zip(data, data_names):
    frame = pd.DataFrame(elements)
    mi.matrix(frame, figsize=(10, 5))
    plt.title(f"Missing Values for {label}")
Algorithms¶
Validation of algorithm from section 6.2 Implementation of main components
Source for section 7.3 Results
nlp2sbvr¶
Elements measurements from chapter 7.2.3 Terms, names, facts, and operative rules
Measuring similarity with SEMSCORE¶
Evaluating SEMSCORE (AYNETDINOV;AKBIK, 2024) between the predicted and true statements for each element.
WARNING: Expensive operation!
If the data is already available, the evaluation processing can be skipped. The operation is expensive; if you only need to compile the evaluation, set SKIP to True.
# Toggle: SEMSCORE computation is expensive; leave SKIP=True to reuse cached scores.
SKIP = True
if not SKIP:
    for element_list, data_name in zip(data, data_names):
        for element in element_list:
            # Compute only when 'semscore' is absent or unset, so reruns are cheap.
            if "semscore" not in element or element["semscore"] is None:
                # Fall back to 'definition' when the element has no 'statement'.
                original_sentence = element.get("statement", element.get("definition"))
                transformed_sentence = element.get("transformed")
                templates_ids = element.get("templates_ids")
                element_name = element.get("element_name")
                # Drop classifier metadata that is not needed for scoring.
                for key in [
                    "explanation",
                    "confidence",
                    "subtype_confidence",
                    "subtype_explanation",
                ]:
                    element.pop(key, None)  # Using pop with None to avoid KeyError
                logger.debug(f"{element_name=}")
                logger.debug(
                    f"{data_name} - {element['statement_id']}: {element['doc_id']}{element['sources']}\nOriginal Sentence: {original_sentence}\nTransformed Sentence: {transformed_sentence}\ntemplates: {templates_ids}\n"
                )
                logger.debug(f"{element=}")
                # SEMSCORE: similarity between the original and transformed sentences.
                similarity = compare_sentences(original_sentence, transformed_sentence)
                logger.info(
                    f"element: {element_name}, similarity score: {similarity}\n"
                )
                # Cache the score on the element so future runs can skip it.
                element["semscore"] = similarity
                logger.debug(f"element: {element}\n")
            else:
                logger.debug(
                    f"{element.get('element_name')} already has a semscore: {element['semscore']}"
                )
Check if SEMSCORE was calculated.
def _semscore_complete(items):
    """Return True when every item carries a non-None 'semscore'."""
    return all(item.get("semscore") is not None for item in items)


# Did every element of each kind get a SEMSCORE?
semscore_in_operative_rules = _semscore_complete(pred_operative_rules_classify)
semscore_in_facts = _semscore_complete(pred_facts_classify)
semscore_in_terms = _semscore_complete(pred_terms_classify)
semscore_in_names = _semscore_complete(pred_names_classify)

# Log counts and completion flags for each element kind.
for label, items, done in (
    ("Operative Rules", pred_operative_rules_classify, semscore_in_operative_rules),
    ("Facts", pred_facts_classify, semscore_in_facts),
    ("Terms", pred_terms_classify, semscore_in_terms),
    ("Names", pred_names_classify, semscore_in_names),
):
    logger.info(f"{label} to evaluate: {len(items)}, semscore was calculated: {done}")
2024-12-14 13:12:26 - INFO - Operative Rules to evaluate: 60, semscore was calculated: True 2024-12-14 13:12:26 - INFO - Facts to evaluate: 160, semscore was calculated: True 2024-12-14 13:12:26 - INFO - Terms to evaluate: 280, semscore was calculated: True 2024-12-14 13:12:26 - INFO - Names to evaluate: 50, semscore was calculated: True 2024-12-14 13:12:26 - INFO - Facts to evaluate: 160, semscore was calculated: True 2024-12-14 13:12:26 - INFO - Terms to evaluate: 280, semscore was calculated: True 2024-12-14 13:12:26 - INFO - Names to evaluate: 50, semscore was calculated: True
Evaluation criteria (SHANKAR et al., 2024)¶
Based on the prompt, there are three inferred evaluation criteria that align with the approach proposed by EvalGen (SHANKAR et al., 2024):
Similarity Score
- Given the original_sentence and transformed_sentence, how similar are they, from 0 to 1? And how confident are you in your estimate, from 0 to 1?
Transformation Accuracy
- From 0 to 1, how well does the "transformed_sentence" reflect the original_sentence with the structure and phrasing provided by the template?
Grammar and Syntax Accuracy
- How grammatically correct and syntactically accurate is the transformed sentence, from 0 to 1?
LLM-as-a-judge¶
References of the LLM-as-a-judge approach: (WEI; CHEN; LUO, 2024), (DONG; HU; COLLIER, 2024), (ZHENG et al., 2023)
Prompt engineering¶
System prompt
def get_system_prompt_judge_sentence_similarity(template):
    """Build the system prompt for the LLM-as-a-judge sentence evaluation.

    The prompt instructs the model to score similarity, transformation
    accuracy, and grammar/syntax, and to answer in a fixed JSON schema.

    Args:
        template: Templates and subtemplates text appended to the prompt.

    Returns:
        The formatted system prompt string.
    """
    # BUG FIX: corrected the "tranformed" misspelling in the first criterion.
    return f"""
# Task
You're an expert in judging sentence similarity and transformation using a template.
These criteria should support the evaluation process by verifying classification accuracy, template application, and transformation fidelity.
Check the criteria and evaluate the output:
1. **Similarity Score**
- Given the statement or definition and transformed sentence (transformed), how similar are they from 0 to 1? And how confident are you about your estimation from 0 to 1?
2. **Transformation Accuracy**
- From 0 to 1, how does the transformed sentence (transformed) reflect the original sentence (statement or definition) with the structure and phrasing provided by the template and subtemplates?
3. **Grammar and Syntax Accuracy**
- How is the transformed sentence (transformed) grammatically correct and syntactically accurate from 0 to 1?
# Output Format
Record your evaluation in JSON format as follows:
```json
{{
"doc_id": "<Document ID>",
"statement_id": "<Statement ID>",
"sources": ["<source>"],
"similarity_score": <Similarity score>,
"similarity_score_confidence": <Confidence score>,
"transformation_accuracy": <Transformation score>,
"grammar_syntax_accuracy": <Grammar score>,
"findings": ["<Things found during the evaluation and worth to be mentioned>",
"<other things to mention>"
],
"semscore": <original semscore>
}}
```
# Input example
{{
"doc_id": <Document ID>,
"statement_id": <Statement ID>,
"statement or definition": <original sentence>,
"sources": [<source>],
"terms": [
{{"term": <signifier>, "classification": <Proper or Common Noun>}},
...
],
"verb_symbols": <verbs or phrasal verbs>,
"element_name": <name of element: Name, Term, Fact, Fact Type, Operative Rule>,
"transformed": <transformed sentence>,
"type": <type of element: Definitional, Activity, Party, Data>,
"subtype": <subtype of element>,
"templates_ids": ["T8"],
"semscore": <semscore>
}}
# Templates and Subtemplates
{template}
"""
User prompt
def get_user_prompt_judge_sentence_similarity(element_name, rule):
    """Build the user prompt carrying one element's rule data as pretty JSON."""
    payload = json.dumps(rule, indent=2)
    return f"""
# rule data for an element: {element_name}
{payload}
"""
Measuring similarity with LLM Judge¶
Preparing system and user prompts for each element and call the judge.
# Run the LLM judge over every element list, then checkpoint the results.
if not SKIP:
    for elements, label in zip(data, data_names):
        # Build one system/user prompt pair per element of this kind.
        sys_prompts, usr_prompts, elem_name = get_prompts_for_judge(
            elements, config["DEFAULT_DATA_DIR"]
        )
        responses = evaluate_statement(
            element_name=elem_name,
            user_prompts=usr_prompts,
            system_prompts=sys_prompts,
            manager=manager,
        )
        logger.debug(f"{responses=}")
    # Persist the state to a file
    save_checkpoint(filename=config["DEFAULT_CHECKPOINT_FILE"], manager=manager)
Average similarity score per document: ~5 s.
Elements evaluation¶
# Restore every checkpoint file; yields one DocumentManager per file plus its file metadata.
managers, file_info_list = get_all_checkpoints(config["DEFAULT_CHECKPOINT_DIR"])
2024-12-14 13:12:26 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-1.json 2024-12-14 13:12:26 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-10.json 2024-12-14 13:12:26 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-2.json 2024-12-14 13:12:26 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-3.json 2024-12-14 13:12:26 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-4.json 2024-12-14 13:12:26 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-5.json 2024-12-14 13:12:26 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-6.json
2024-12-14 13:12:26 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-7.json 2024-12-14 13:12:26 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-8.json 2024-12-14 13:12:26 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-9.json
# Accumulators for the judge's evaluations, one list per element kind.
eval_operative_rules = []
eval_facts = []
eval_terms = []
eval_names = []

# Map each checkpoint document ID to the accumulator it feeds.
_targets = (
    ("validation_judge_Operative_Rules", eval_operative_rules),
    ("validation_judge_Names", eval_names),
    ("validation_judge_Terms", eval_terms),
    ("validation_judge_Fact_Types", eval_facts),
)

# Collect the judge evaluations stored in every checkpoint run.
for manager, file_info in zip(managers, file_info_list):
    for doc_id, bucket in _targets:
        bucket.extend(
            manager.retrieve_document(doc_id, "llm_validation").content
        )

logger.info(f"Operative Rules: {len(eval_operative_rules)}")
logger.info(f"Names: {len(eval_names)}")
logger.info(f"Terms: {len(eval_terms)}")
logger.info(f"Facts: {len(eval_facts)}")
2024-12-14 13:12:26 - INFO - Operative Rules: 60 2024-12-14 13:12:26 - INFO - Names: 50 2024-12-14 13:12:26 - INFO - Terms: 280 2024-12-14 13:12:26 - INFO - Facts: 160 2024-12-14 13:12:26 - INFO - Names: 50 2024-12-14 13:12:26 - INFO - Terms: 280 2024-12-14 13:12:26 - INFO - Facts: 160
#manager = restore_checkpoint(filename=config["DEFAULT_CHECKPOINT_FILE"])
# Bundle the evaluation lists under the sheet/element-type names used downstream.
elements_data = {
    "Operative_Rules": eval_operative_rules,
    "Names": eval_names,
    "Terms": eval_terms,
    "Fact_Types": eval_facts,
}
for key, items in elements_data.items():
    logger.info(f"{key}: {len(items)}")
2024-12-14 13:12:26 - INFO - Operative_Rules: 60 2024-12-14 13:12:26 - INFO - Names: 50 2024-12-14 13:12:26 - INFO - Terms: 280 2024-12-14 13:12:26 - INFO - Fact_Types: 160 2024-12-14 13:12:26 - INFO - Names: 50 2024-12-14 13:12:26 - INFO - Terms: 280 2024-12-14 13:12:26 - INFO - Fact_Types: 160
Checking missing data
# Visualize missing values for each element type.
# BUG FIX: the original indexed and titled with the stale variable `key`
# (left over from the previous loop) instead of the loop variable
# `element_key`, so it plotted the same data under the same title four times.
for element_key in elements_data.keys():
    element_df = pd.DataFrame(elements_data[element_key])
    mi.matrix(element_df, figsize=(10, 5))
    plt.title(f"Missing Values for {element_key}")
Process all metrics
# Compute statistics, correlations, and plots for every element type and write
# the combined analysis workbook (plus a Combined_Data sheet) to the output dir.
df = process_all_elements(elements_data, config["DEFAULT_OUTPUT_DIR"])
2024-12-14 13:12:27 - INFO - Processing All Elements ---------------------------------------- 2024-12-14 13:12:27 - INFO - Processing Element Type: Operative_Rules
Semscore Statistics:
| semscore | |
|---|---|
| count | 60.000000 |
| mean | 0.940934 |
| std | 0.017991 |
| min | 0.904715 |
| 25% | 0.921666 |
| 50% | 0.945994 |
| 75% | 0.952143 |
| max | 0.961199 |
| range | 0.056484 |
| IQR | 0.030477 |
Similarity Score Statistics:
| similarity_score | |
|---|---|
| count | 60.000000 |
| mean | 0.905833 |
| std | 0.036928 |
| min | 0.850000 |
| 25% | 0.900000 |
| 50% | 0.900000 |
| 75% | 0.950000 |
| max | 0.950000 |
| range | 0.100000 |
| IQR | 0.050000 |
/home/adsantos/miniconda3/envs/ipt-cfr2sbvr/lib/python3.11/site-packages/statsmodels/regression/mixed_linear_model.py:1634: UserWarning: Random effects covariance is singular warnings.warn(msg)
Pearson Correlation between semscore and similarity_score: 0.1253 Spearman Correlation between semscore and similarity_score: 0.1281 Intraclass Correlation Coefficient (ICC) between semscore and similarity_score: 0.0000
/home/adsantos/miniconda3/envs/ipt-cfr2sbvr/lib/python3.11/site-packages/statsmodels/regression/mixed_linear_model.py:1634: UserWarning: Random effects covariance is singular warnings.warn(msg) /home/adsantos/miniconda3/envs/ipt-cfr2sbvr/lib/python3.11/site-packages/statsmodels/regression/mixed_linear_model.py:2237: ConvergenceWarning: The MLE may be on the boundary of the parameter space. warnings.warn(msg, ConvergenceWarning)
Count of Transformation Accuracy scores above 0.8: 33
2024-12-14 13:12:29 - INFO - Processing Element Type: Names
Semscore Statistics:
| semscore | |
|---|---|
| count | 50.000000 |
| mean | 0.800785 |
| std | 0.057943 |
| min | 0.678210 |
| 25% | 0.785631 |
| 50% | 0.820770 |
| 75% | 0.850337 |
| max | 0.850504 |
| range | 0.172295 |
| IQR | 0.064706 |
Similarity Score Statistics:
| similarity_score | |
|---|---|
| count | 50.000000 |
| mean | 0.940000 |
| std | 0.020203 |
| min | 0.900000 |
| 25% | 0.950000 |
| 50% | 0.950000 |
| 75% | 0.950000 |
| max | 0.950000 |
| range | 0.050000 |
| IQR | 0.000000 |
Pearson Correlation between semscore and similarity_score: 0.1313 Spearman Correlation between semscore and similarity_score: 0.3465
/home/adsantos/miniconda3/envs/ipt-cfr2sbvr/lib/python3.11/site-packages/statsmodels/base/model.py:607: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
warnings.warn("Maximum Likelihood optimization failed to "
/home/adsantos/miniconda3/envs/ipt-cfr2sbvr/lib/python3.11/site-packages/statsmodels/regression/mixed_linear_model.py:2200: ConvergenceWarning: Retrying MixedLM optimization with lbfgs
warnings.warn(
/home/adsantos/miniconda3/envs/ipt-cfr2sbvr/lib/python3.11/site-packages/statsmodels/base/model.py:607: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
warnings.warn("Maximum Likelihood optimization failed to "
/home/adsantos/miniconda3/envs/ipt-cfr2sbvr/lib/python3.11/site-packages/statsmodels/regression/mixed_linear_model.py:2200: ConvergenceWarning: Retrying MixedLM optimization with cg
warnings.warn(
Intraclass Correlation Coefficient (ICC) between semscore and similarity_score: 0.1691
/home/adsantos/miniconda3/envs/ipt-cfr2sbvr/lib/python3.11/site-packages/statsmodels/base/model.py:607: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
warnings.warn("Maximum Likelihood optimization failed to "
/home/adsantos/miniconda3/envs/ipt-cfr2sbvr/lib/python3.11/site-packages/statsmodels/regression/mixed_linear_model.py:2206: ConvergenceWarning: MixedLM optimization failed, trying a different optimizer may help.
warnings.warn(msg, ConvergenceWarning)
/home/adsantos/miniconda3/envs/ipt-cfr2sbvr/lib/python3.11/site-packages/statsmodels/regression/mixed_linear_model.py:2218: ConvergenceWarning: Gradient optimization failed, |grad| = 24.388920
warnings.warn(msg, ConvergenceWarning)
/home/adsantos/miniconda3/envs/ipt-cfr2sbvr/lib/python3.11/site-packages/statsmodels/regression/mixed_linear_model.py:2237: ConvergenceWarning: The MLE may be on the boundary of the parameter space.
warnings.warn(msg, ConvergenceWarning)
/home/adsantos/miniconda3/envs/ipt-cfr2sbvr/lib/python3.11/site-packages/statsmodels/regression/mixed_linear_model.py:2261: ConvergenceWarning: The Hessian matrix at the estimated parameter values is not positive definite.
warnings.warn(msg, ConvergenceWarning)
Count of Transformation Accuracy scores above 0.8: 50
2024-12-14 13:12:32 - INFO - Processing Element Type: Terms
Semscore Statistics:
| semscore | |
|---|---|
| count | 280.000000 |
| mean | 0.798593 |
| std | 0.068700 |
| min | 0.654959 |
| 25% | 0.755620 |
| 50% | 0.800653 |
| 75% | 0.848153 |
| max | 0.950956 |
| range | 0.295996 |
| IQR | 0.092533 |
Similarity Score Statistics:
| similarity_score | |
|---|---|
| count | 280.000000 |
| mean | 0.920000 |
| std | 0.048564 |
| min | 0.500000 |
| 25% | 0.900000 |
| 50% | 0.950000 |
| 75% | 0.950000 |
| max | 1.000000 |
| range | 0.500000 |
| IQR | 0.050000 |
Pearson Correlation between semscore and similarity_score: -0.0952 Spearman Correlation between semscore and similarity_score: -0.1062
/home/adsantos/miniconda3/envs/ipt-cfr2sbvr/lib/python3.11/site-packages/statsmodels/base/model.py:607: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
warnings.warn("Maximum Likelihood optimization failed to "
/home/adsantos/miniconda3/envs/ipt-cfr2sbvr/lib/python3.11/site-packages/statsmodels/regression/mixed_linear_model.py:2200: ConvergenceWarning: Retrying MixedLM optimization with lbfgs
warnings.warn(
/home/adsantos/miniconda3/envs/ipt-cfr2sbvr/lib/python3.11/site-packages/statsmodels/base/model.py:607: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
warnings.warn("Maximum Likelihood optimization failed to "
/home/adsantos/miniconda3/envs/ipt-cfr2sbvr/lib/python3.11/site-packages/statsmodels/regression/mixed_linear_model.py:2200: ConvergenceWarning: Retrying MixedLM optimization with cg
warnings.warn(
Intraclass Correlation Coefficient (ICC) between semscore and similarity_score: 0.1209
/home/adsantos/miniconda3/envs/ipt-cfr2sbvr/lib/python3.11/site-packages/statsmodels/base/model.py:607: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
warnings.warn("Maximum Likelihood optimization failed to "
/home/adsantos/miniconda3/envs/ipt-cfr2sbvr/lib/python3.11/site-packages/statsmodels/regression/mixed_linear_model.py:2206: ConvergenceWarning: MixedLM optimization failed, trying a different optimizer may help.
warnings.warn(msg, ConvergenceWarning)
/home/adsantos/miniconda3/envs/ipt-cfr2sbvr/lib/python3.11/site-packages/statsmodels/regression/mixed_linear_model.py:2218: ConvergenceWarning: Gradient optimization failed, |grad| = 102.678696
warnings.warn(msg, ConvergenceWarning)
/home/adsantos/miniconda3/envs/ipt-cfr2sbvr/lib/python3.11/site-packages/statsmodels/regression/mixed_linear_model.py:2237: ConvergenceWarning: The MLE may be on the boundary of the parameter space.
warnings.warn(msg, ConvergenceWarning)
/home/adsantos/miniconda3/envs/ipt-cfr2sbvr/lib/python3.11/site-packages/statsmodels/regression/mixed_linear_model.py:2261: ConvergenceWarning: The Hessian matrix at the estimated parameter values is not positive definite.
warnings.warn(msg, ConvergenceWarning)
Count of Transformation Accuracy scores above 0.8: 204
2024-12-14 13:12:40 - INFO - Processing Element Type: Fact_Types
Semscore Statistics:
| semscore | |
|---|---|
| count | 160.000000 |
| mean | 0.906189 |
| std | 0.054946 |
| min | 0.726641 |
| 25% | 0.865581 |
| 50% | 0.919224 |
| 75% | 0.953123 |
| max | 0.993932 |
| range | 0.267291 |
| IQR | 0.087542 |
Similarity Score Statistics:
| similarity_score | |
|---|---|
| count | 160.000000 |
| mean | 0.918125 |
| std | 0.058944 |
| min | 0.700000 |
| 25% | 0.900000 |
| 50% | 0.950000 |
| 75% | 0.950000 |
| max | 0.950000 |
| range | 0.250000 |
| IQR | 0.050000 |
Pearson Correlation between semscore and similarity_score: -0.0440 Spearman Correlation between semscore and similarity_score: 0.0241 Intraclass Correlation Coefficient (ICC) between semscore and similarity_score: 0.0000
/home/adsantos/miniconda3/envs/ipt-cfr2sbvr/lib/python3.11/site-packages/statsmodels/regression/mixed_linear_model.py:2237: ConvergenceWarning: The MLE may be on the boundary of the parameter space. warnings.warn(msg, ConvergenceWarning)
Count of Transformation Accuracy scores above 0.8: 121
Processing Combined Data Combined Semscore Statistics:
| semscore | |
|---|---|
| count | 550.000000 |
| mean | 0.845621 |
| std | 0.083549 |
| min | 0.654959 |
| 25% | 0.781917 |
| 50% | 0.848162 |
| 75% | 0.919216 |
| max | 0.993932 |
| range | 0.338973 |
| IQR | 0.137299 |
Combined Similarity Score Statistics:
| similarity_score | |
|---|---|
| count | 550.000000 |
| mean | 0.919727 |
| std | 0.049468 |
| min | 0.500000 |
| 25% | 0.900000 |
| 50% | 0.950000 |
| 75% | 0.950000 |
| max | 1.000000 |
| range | 0.500000 |
| IQR | 0.050000 |
Combined Pearson Correlation between semscore and similarity_score: -0.1070 Combined Spearman Correlation between semscore and similarity_score: -0.0679
/home/adsantos/miniconda3/envs/ipt-cfr2sbvr/lib/python3.11/site-packages/statsmodels/base/model.py:607: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
warnings.warn("Maximum Likelihood optimization failed to "
/home/adsantos/miniconda3/envs/ipt-cfr2sbvr/lib/python3.11/site-packages/statsmodels/regression/mixed_linear_model.py:2200: ConvergenceWarning: Retrying MixedLM optimization with lbfgs
warnings.warn(
/home/adsantos/miniconda3/envs/ipt-cfr2sbvr/lib/python3.11/site-packages/statsmodels/regression/mixed_linear_model.py:2237: ConvergenceWarning: The MLE may be on the boundary of the parameter space.
warnings.warn(msg, ConvergenceWarning)
Combined Intraclass Correlation Coefficient (ICC) between semscore and similarity_score: 0.0236
Combined Count of Transformation Accuracy scores above 0.8: 408
Analysis saved to '../outputs/combined_analysis_results.xlsx'
Metrics¶
# Updated functions for side-by-side plotting and color customization
# Function to plot histogram for semscore and similarity_score side-by-side
def plot_histogram_side_by_side(df, title, xlabel, output_dir, filename):
    """Plot overlaid histograms of 'semscore' and 'similarity_score' and save to PNG.

    Args:
        df: DataFrame with numeric 'semscore' and 'similarity_score' columns.
        title: Plot title.
        xlabel: X-axis label.
        output_dir: Directory where the image is written (assumed to exist).
        filename: Name of the PNG file.

    Returns:
        Full path of the saved PNG file.
    """
    plt.figure(figsize=(12, 6))
    # Colorblind-friendly palette shared by all side-by-side plots.
    plt.hist(df["semscore"], bins=20, color="#D55E00", alpha=0.7,
             label="Semscore", linestyle="--", edgecolor="black")
    plt.hist(df["similarity_score"], bins=20, color="#0072B2", alpha=0.7,
             label="Similarity Score", linestyle="-", edgecolor="black")
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel("Frequency")
    plt.legend()
    plt.grid(True)
    plot_path = os.path.join(output_dir, filename)
    # BUG FIX: save BEFORE show — plt.show() releases the figure, so calling
    # savefig afterwards wrote a blank image to disk.
    plt.savefig(plot_path)
    plt.show()
    plt.close()
    return plot_path
# Function to plot box plot for semscore and similarity_score side-by-side
def plot_boxplot_side_by_side(df, title, ylabel, output_dir, filename):
    """Draw side-by-side box plots of 'semscore' and 'similarity_score' and save to PNG.

    Args:
        df: DataFrame with numeric 'semscore' and 'similarity_score' columns.
        title: Plot title.
        ylabel: Y-axis label.
        output_dir: Directory where the image is written (assumed to exist).
        filename: Name of the PNG file.

    Returns:
        Full path of the saved PNG file.
    """
    plt.figure(figsize=(8, 6))
    # NOTE(review): `labels=` was renamed `tick_labels=` in Matplotlib 3.9 —
    # confirm the pinned Matplotlib version before upgrading.
    boxplot = plt.boxplot(
        [df["semscore"].dropna(), df["similarity_score"].dropna()],
        labels=["Semscore", "Similarity Score"],
        patch_artist=True,
        boxprops=dict(color="black"),
        medianprops=dict(color="black"),
        capprops=dict(color="black"),
        whiskerprops=dict(color="black"),
    )
    # Colorblind-friendly fills matching the other side-by-side plots.
    colors = ["#D55E00", "#0072B2"]
    for patch, color in zip(boxplot["boxes"], colors):
        patch.set_facecolor(color)
    plt.title(title)
    plt.ylabel(ylabel)
    plt.grid(True)
    plot_path = os.path.join(output_dir, filename)
    # BUG FIX: save BEFORE show — plt.show() releases the figure, so calling
    # savefig afterwards wrote a blank image to disk.
    plt.savefig(plot_path)
    plt.show()
    plt.close()
    return plot_path
# Function to plot density plot for semscore and similarity_score side-by-side
def plot_density_side_by_side(df, title, xlabel, output_dir, filename):
    """Plot overlaid KDE curves of 'semscore' and 'similarity_score' and save to PNG.

    Args:
        df: DataFrame with numeric 'semscore' and 'similarity_score' columns.
        title: Plot title.
        xlabel: X-axis label.
        output_dir: Directory where the image is written (assumed to exist).
        filename: Name of the PNG file.

    Returns:
        Full path of the saved PNG file.
    """
    plt.figure(figsize=(12, 6))
    # Colorblind-friendly palette shared by all side-by-side plots.
    df["semscore"].plot(kind="kde", color="#D55E00", alpha=0.7,
                        linestyle="--", label="Semscore")
    df["similarity_score"].plot(kind="kde", color="#0072B2", alpha=0.7,
                                linestyle="-", label="Similarity Score")
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel("Density")
    plt.legend()
    plt.grid(True)
    plot_path = os.path.join(output_dir, filename)
    # BUG FIX: save BEFORE show — plt.show() releases the figure, so calling
    # savefig afterwards wrote a blank image to disk.
    plt.savefig(plot_path)
    plt.show()
    plt.close()
    return plot_path
# Updated process_all_elements function
def process_all_elements_updated(element_data, output_dir):
    """Build score plots per element type and bundle them into one Excel workbook.

    For each element type a histogram, a box plot, and a density plot of
    ``semscore`` vs ``similarity_score`` are saved as PNGs and embedded into a
    dedicated worksheet; a final "Combined" sheet holds the same three plots
    over the concatenation of all element types.

    Args:
        element_data: Mapping of element-type name -> tabular content accepted
            by ``pd.DataFrame``, with at least ``semscore`` and
            ``similarity_score`` fields.
        output_dir: Directory for the workbook; also used as scratch space for
            the PNGs, which are deleted at the end.

    Returns:
        The combined DataFrame with an added ``element_type`` column.
    """
    os.makedirs(output_dir, exist_ok=True)
    excel_file_path = os.path.join(output_dir, "combined_analysis_results.xlsx")
    writer = pd.ExcelWriter(excel_file_path, engine="xlsxwriter")
    workbook = writer.book
    combined_df_list = []
    image_files = []  # temporary PNG paths, removed after embedding
    for element_name, content in element_data.items():
        df = pd.DataFrame(content)
        numeric_cols = ["semscore", "similarity_score"]
        df[numeric_cols] = df[numeric_cols].astype(float)
        df["element_type"] = element_name
        combined_df_list.append(df)
        sheet_name = element_name[:31]  # Excel caps sheet names at 31 chars
        # Create the worksheet on the underlying xlsxwriter workbook and
        # register it with the pandas writer so insert_image() can be used.
        worksheet = workbook.add_worksheet(sheet_name)
        writer.sheets[sheet_name] = worksheet
        row = 0  # anchor row for the next image; plots are stacked 20 rows apart
        # Histograms side-by-side
        plot_filename = f"histogram_side_by_side_{element_name}.png"
        plot_path = plot_histogram_side_by_side(
            df,
            f"Histograms of Semscore and Similarity Score - {element_name}",
            "Scores",
            output_dir,
            plot_filename,
        )
        worksheet.insert_image(row, 0, plot_path)
        row += 20
        image_files.append(plot_path)
        # Boxplots side-by-side
        plot_filename = f"boxplot_side_by_side_{element_name}.png"
        plot_path = plot_boxplot_side_by_side(
            df,
            f"Boxplots of Semscore and Similarity Score - {element_name}",
            "Scores",
            output_dir,
            plot_filename,
        )
        worksheet.insert_image(row, 0, plot_path)
        row += 20
        image_files.append(plot_path)
        # Density plots side-by-side
        plot_filename = f"density_side_by_side_{element_name}.png"
        plot_path = plot_density_side_by_side(
            df,
            f"Density Plots of Semscore and Similarity Score - {element_name}",
            "Scores",
            output_dir,
            plot_filename,
        )
        worksheet.insert_image(row, 0, plot_path)
        row += 20
        image_files.append(plot_path)
    combined_df = pd.concat(combined_df_list, ignore_index=True)
    # Combined Histograms side-by-side
    plot_filename = "histogram_side_by_side_combined.png"
    plot_path = plot_histogram_side_by_side(
        combined_df,
        "Combined Histograms of Semscore and Similarity Score",
        "Scores",
        output_dir,
        plot_filename,
    )
    writer.sheets["Combined"] = workbook.add_worksheet("Combined")
    writer.sheets["Combined"].insert_image(0, 0, plot_path)
    image_files.append(plot_path)
    # Combined Boxplots side-by-side
    plot_filename = "boxplot_side_by_side_combined.png"
    plot_path = plot_boxplot_side_by_side(
        combined_df,
        "Combined Boxplots of Semscore and Similarity Score",
        "Scores",
        output_dir,
        plot_filename,
    )
    writer.sheets["Combined"].insert_image(25, 0, plot_path)
    image_files.append(plot_path)
    # Combined Density Plots side-by-side
    plot_filename = "density_side_by_side_combined.png"
    plot_path = plot_density_side_by_side(
        combined_df,
        "Combined Density Plots of Semscore and Similarity Score",
        "Scores",
        output_dir,
        plot_filename,
    )
    writer.sheets["Combined"].insert_image(50, 0, plot_path)
    image_files.append(plot_path)
    writer.close()
    # Remove the scratch PNGs now that they live inside the workbook.
    for image_file in image_files:
        if os.path.exists(image_file):
            os.remove(image_file)
    return combined_df
combined_df = process_all_elements_updated(elements_data, config["DEFAULT_OUTPUT_DIR"])
/tmp/ipykernel_4986/1576221141.py:23: MatplotlibDeprecationWarning: The 'labels' parameter of boxplot() has been renamed 'tick_labels' since Matplotlib 3.9; support for the old name will be dropped in 3.11.
/tmp/ipykernel_4986/1576221141.py:23: MatplotlibDeprecationWarning: The 'labels' parameter of boxplot() has been renamed 'tick_labels' since Matplotlib 3.9; support for the old name will be dropped in 3.11.
/tmp/ipykernel_4986/1576221141.py:23: MatplotlibDeprecationWarning: The 'labels' parameter of boxplot() has been renamed 'tick_labels' since Matplotlib 3.9; support for the old name will be dropped in 3.11.
/tmp/ipykernel_4986/1576221141.py:23: MatplotlibDeprecationWarning: The 'labels' parameter of boxplot() has been renamed 'tick_labels' since Matplotlib 3.9; support for the old name will be dropped in 3.11.
/tmp/ipykernel_4986/1576221141.py:23: MatplotlibDeprecationWarning: The 'labels' parameter of boxplot() has been renamed 'tick_labels' since Matplotlib 3.9; support for the old name will be dropped in 3.11.
Describing the metrics semscore and similarity_score
combined_df.groupby("element_type")[["semscore", "similarity_score"]].describe()#.to_excel(config["DEFAULT_OUTPUT_DIR"] + "/sem_sim_descriptive_stats.xlsx")
| semscore | similarity_score | |||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | mean | std | min | 25% | 50% | 75% | max | count | mean | std | min | 25% | 50% | 75% | max | |
| element_type | ||||||||||||||||
| Fact_Types | 160.0 | 0.906189 | 0.054946 | 0.726641 | 0.865581 | 0.919224 | 0.953123 | 0.993932 | 160.0 | 0.918125 | 0.058944 | 0.70 | 0.90 | 0.95 | 0.95 | 0.95 |
| Names | 50.0 | 0.800785 | 0.057943 | 0.678210 | 0.785631 | 0.820770 | 0.850337 | 0.850504 | 50.0 | 0.940000 | 0.020203 | 0.90 | 0.95 | 0.95 | 0.95 | 0.95 |
| Operative_Rules | 60.0 | 0.940934 | 0.017991 | 0.904715 | 0.921666 | 0.945994 | 0.952143 | 0.961199 | 60.0 | 0.905833 | 0.036928 | 0.85 | 0.90 | 0.90 | 0.95 | 0.95 |
| Terms | 280.0 | 0.798593 | 0.068700 | 0.654959 | 0.755620 | 0.800653 | 0.848153 | 0.950956 | 280.0 | 0.920000 | 0.048564 | 0.50 | 0.90 | 0.95 | 0.95 | 1.00 |
See correlation analysis below
Similarity_score and confidence
combined_df.groupby("element_type")[["similarity_score", "similarity_score_confidence"]].describe()
| similarity_score | similarity_score_confidence | |||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | mean | std | min | 25% | 50% | 75% | max | count | mean | std | min | 25% | 50% | 75% | max | |
| element_type | ||||||||||||||||
| Fact_Types | 160.0 | 0.918125 | 0.058944 | 0.70 | 0.90 | 0.95 | 0.95 | 0.95 | 160.0 | 0.892188 | 0.023490 | 0.80 | 0.90 | 0.9 | 0.9 | 0.9 |
| Names | 50.0 | 0.940000 | 0.020203 | 0.90 | 0.95 | 0.95 | 0.95 | 0.95 | 50.0 | 0.891000 | 0.019404 | 0.85 | 0.90 | 0.9 | 0.9 | 0.9 |
| Operative_Rules | 60.0 | 0.905833 | 0.036928 | 0.85 | 0.90 | 0.90 | 0.95 | 0.95 | 60.0 | 0.882500 | 0.024050 | 0.85 | 0.85 | 0.9 | 0.9 | 0.9 |
| Terms | 280.0 | 0.920000 | 0.048564 | 0.50 | 0.90 | 0.95 | 0.95 | 1.00 | 280.0 | 0.884821 | 0.033760 | 0.70 | 0.85 | 0.9 | 0.9 | 1.0 |
# Calculate correlation by element_type.
# Select the two metric columns BEFORE apply() so the grouping column is not
# part of the operated frame — this avoids the pandas DeprecationWarning about
# DataFrameGroupBy.apply operating on the grouping columns.
combined_df.groupby("element_type")[["similarity_score", "similarity_score_confidence"]].apply(
    lambda group: group["similarity_score"].corr(group["similarity_score_confidence"])
).reset_index(name="correlation")
/tmp/ipykernel_4986/3731941340.py:2: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
| element_type | correlation | |
|---|---|---|
| 0 | Fact_Types | 0.647984 |
| 1 | Names | 0.937043 |
| 2 | Operative_Rules | 0.116892 |
| 3 | Terms | 0.628518 |
transformation_accuracy and grammar_syntax_accuracy
combined_df.groupby("element_type")[["transformation_accuracy", "grammar_syntax_accuracy"]].describe()
| transformation_accuracy | grammar_syntax_accuracy | |||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | mean | std | min | 25% | 50% | 75% | max | count | mean | std | min | 25% | 50% | 75% | max | |
| element_type | ||||||||||||||||
| Fact_Types | 160.0 | 0.869688 | 0.086892 | 0.60 | 0.85 | 0.90 | 0.9 | 0.95 | 160.0 | 0.933750 | 0.080006 | 0.60 | 0.95 | 0.95 | 0.95 | 1.00 |
| Names | 50.0 | 0.900000 | 0.010102 | 0.85 | 0.90 | 0.90 | 0.9 | 0.95 | 50.0 | 0.970000 | 0.024744 | 0.95 | 0.95 | 0.95 | 1.00 | 1.00 |
| Operative_Rules | 60.0 | 0.851667 | 0.050394 | 0.80 | 0.80 | 0.85 | 0.9 | 0.95 | 60.0 | 0.941667 | 0.018791 | 0.90 | 0.95 | 0.95 | 0.95 | 0.95 |
| Terms | 280.0 | 0.875536 | 0.075608 | 0.30 | 0.80 | 0.90 | 0.9 | 1.00 | 280.0 | 0.952143 | 0.074745 | 0.20 | 0.95 | 0.95 | 1.00 | 1.00 |
# Calculate correlation by element_type.
# Column selection before apply() keeps the grouping column out of the
# operated frame (silences the DataFrameGroupBy.apply DeprecationWarning).
combined_df.groupby("element_type")[["transformation_accuracy", "grammar_syntax_accuracy"]].apply(
    lambda group: group["transformation_accuracy"].corr(group["grammar_syntax_accuracy"])
).reset_index(name="correlation")
/tmp/ipykernel_4986/1485779397.py:2: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
| element_type | correlation | |
|---|---|---|
| 0 | Fact_Types | 0.756493 |
| 1 | Names | 0.204124 |
| 2 | Operative_Rules | 0.462378 |
| 3 | Terms | 0.764044 |
Correlation analysis similarity_score and semscore¶
Top 10 lowest semscore
# Make a copy of the DataFrame for further analysis
# NOTE(review): these copy `df`, which at this point is the loop-local frame
# left over from process_all_elements_updated — yet the later cells treat the
# copies as the full combined set. Presumably `combined_df.copy()` was
# intended; confirm which frame should be analyzed here.
df_aval = df.copy()
df_similarity = df.copy()
df_agree = df.copy()
# Ten records with the lowest semscore (notebook display).
df_aval.nsmallest(10, ['semscore'])
| doc_id | statement_id | statement | sources | semscore | similarity_score | similarity_score_confidence | transformation_accuracy | grammar_syntax_accuracy | findings | element_type | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 112 | § 275.0-2 | Managing agent | Any person, including a trustee, who directs o... | [(b)(1)] | 0.654959 | 0.95 | 0.9 | 0.95 | 1.00 | [The transformed sentence accurately reflects ... | Terms |
| 224 | § 275.0-2 | Managing agent | Any person, including a trustee, who directs o... | [(b)(1)] | 0.655024 | 0.95 | 0.9 | 0.95 | 1.00 | [The transformed sentence accurately reflects ... | Terms |
| 168 | § 275.0-2 | Managing agent | Any person, including a trustee, who directs o... | [(b)(1)] | 0.655041 | 0.95 | 0.9 | 0.90 | 1.00 | [The transformed sentence accurately reflects ... | Terms |
| 308 | § 275.0-2 | Managing agent | Any person, including a trustee, who directs o... | [(b)(1)] | 0.655048 | 0.95 | 0.9 | 0.90 | 1.00 | [The transformed sentence accurately reflects ... | Terms |
| 252 | § 275.0-2 | Managing agent | Any person, including a trustee, who directs o... | [(b)(1)] | 0.655050 | 0.95 | 0.9 | 0.90 | 1.00 | [The transformed sentence accurately reflects ... | Terms |
| 140 | § 275.0-2 | Managing agent | Any person, including a trustee, who directs o... | [(b)(1)] | 0.655103 | 0.95 | 0.9 | 0.90 | 0.95 | [The transformed sentence accurately reflects ... | Terms |
| 336 | § 275.0-2 | Managing agent | Any person, including a trustee, who directs o... | [(b)(1)] | 0.655213 | 0.95 | 0.9 | 0.95 | 1.00 | [The transformed sentence accurately reflects ... | Terms |
| 196 | § 275.0-2 | Managing agent | Any person, including a trustee, who directs o... | [(b)(1)] | 0.655215 | 0.95 | 0.9 | 0.95 | 1.00 | [The transformed sentence accurately reflects ... | Terms |
| 280 | § 275.0-2 | Managing agent | Any person, including a trustee, who directs o... | [(b)(1)] | 0.655292 | 0.95 | 0.9 | 0.90 | 1.00 | [The transformed sentence accurately reflects ... | Terms |
| 364 | § 275.0-2 | Managing agent | Any person, including a trustee, who directs o... | [(b)(1)] | 0.655489 | 0.95 | 0.9 | 0.95 | 1.00 | [The transformed sentence accurately reflects ... | Terms |
Top 10 lowest similarity_score
# Bottom 100 records, ranked by similarity_score with semscore as tiebreaker.
rank_columns = ["similarity_score", "semscore"]
df_smallest = df_aval.nsmallest(100, rank_columns)
df_smallest
| doc_id | statement_id | statement | sources | semscore | similarity_score | similarity_score_confidence | transformation_accuracy | grammar_syntax_accuracy | findings | element_type | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 296 | § 275.0-5 | Order of the Commission | An order issued by the Commission under the Act. | [(d)] | 0.810312 | 0.5 | 0.70 | 0.3 | 0.20 | [The transformed sentence 'An order is by defi... | Terms |
| 249 | § 275.0-7 | Trust | A person is presumed to control a trust if the... | [(b)(1)(iv)] | 0.851620 | 0.7 | 0.80 | 0.6 | 0.70 | [The transformed sentence introduces the term ... | Terms |
| 333 | § 275.0-7 | Trust | A person is presumed to control a trust if the... | [(b)(1)(iv)] | 0.917104 | 0.7 | 0.80 | 0.6 | 0.50 | [The transformed sentence changes the meaning ... | Terms |
| 502 | § 275.0-2 | 1 | A person may serve process, pleadings, or othe... | [(a)] | 0.934140 | 0.7 | 0.80 | 0.6 | 0.90 | [The transformed sentence introduces 'by defin... | Fact_Types |
| 390 | § 275.0-2 | 1 | A person may serve process, pleadings, or othe... | [(a)] | 0.934150 | 0.7 | 0.80 | 0.6 | 0.70 | [The transformed sentence does not accurately ... | Fact_Types |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 177 | § 275.0-5 | Facts | Information submitted to the Commission bearin... | [(a)] | 0.759489 | 0.9 | 0.85 | 0.8 | 0.95 | [The transformed sentence maintains the core m... | Terms |
| 345 | § 275.0-5 | Facts | Information submitted to the Commission bearin... | [(a)] | 0.759489 | 0.9 | 0.85 | 0.8 | 0.95 | [The transformed sentence maintains the core m... | Terms |
| 289 | § 275.0-5 | Facts | Information submitted to the Commission bearin... | [(a)] | 0.759561 | 0.9 | 0.85 | 0.8 | 0.95 | [The transformed sentence accurately reflects ... | Terms |
| 121 | § 275.0-5 | Facts | Information submitted to the Commission bearin... | [(a)] | 0.759566 | 0.9 | 0.85 | 0.8 | 0.95 | [The transformed sentence maintains the core m... | Terms |
| 261 | § 275.0-5 | Facts | Information submitted to the Commission bearin... | [(a)] | 0.760273 | 0.9 | 0.85 | 0.8 | 0.95 | [The transformed sentence maintains the core m... | Terms |
100 rows × 11 columns
df_smallest=df_aval.nsmallest(100, ['similarity_score', "semscore"])['score_difference'] = df_similarity['similarity_score'] - df_similarity['semscore']
# The 'sources' column holds lists, which are unhashable; stringify it so
# drop_duplicates can compare rows.
df_aval['sources'] = df_aval['sources'].map(str)
# Keep only distinct records keyed on the identifying columns (display only;
# the result is not assigned back).
identity_columns = ['doc_id', 'statement_id', 'statement', 'sources']
df_aval.drop_duplicates(subset=identity_columns)
| doc_id | statement_id | statement | sources | semscore | similarity_score | similarity_score_confidence | transformation_accuracy | grammar_syntax_accuracy | findings | element_type | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | § 275.0-2 | 3 | The Secretary of the Commission (Secretary) wi... | ['(a)(2)'] | 0.952598 | 0.90 | 0.85 | 0.8 | 0.95 | [The transformed sentence maintains the core m... | Operative_Rules |
| 1 | § 275.0-2 | 4 | If the Secretary certifies that the Commission... | ['(a)(3)'] | 0.944126 | 0.95 | 0.90 | 0.9 | 0.95 | [The transformed sentence maintains the origin... | Operative_Rules |
| 2 | § 275.0-5 | 1 | Notice of the initiation of the proceeding wil... | ['(a)'] | 0.946623 | 0.95 | 0.90 | 0.9 | 0.95 | [The transformed sentence maintains the origin... | Operative_Rules |
| 3 | § 275.0-5 | 2 | Any interested person may, within the period o... | ['(a)'] | 0.914574 | 0.85 | 0.90 | 0.8 | 0.95 | [The transformed sentence captures the essence... | Operative_Rules |
| 4 | § 275.0-5 | 3 | An order disposing of the matter will be issue... | ['(b)'] | 0.958645 | 0.90 | 0.85 | 0.8 | 0.90 | [The transformed sentence maintains the core m... | Operative_Rules |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 405 | § 275.0-7 | 10 | Total assets means the total assets as shown o... | ['(b)(2)'] | 0.976969 | 0.95 | 0.90 | 0.9 | 0.95 | [The transformed sentence closely follows the ... | Fact_Types |
| 415 | § 275.0-7 | 4 | An investment adviser does not control, is not... | ['(a)(3)'] | 0.963108 | 0.95 | 0.90 | 0.9 | 0.95 | [The transformed sentence maintains the origin... | Fact_Types |
| 461 | \n§ 275.0-7 | 2 | An investment adviser has assets under managem... | ['(a)(1)'] | 0.889467 | 0.95 | 0.90 | 0.9 | 0.95 | [The transformed sentence accurately reflects ... | Fact_Types |
| 462 | \n§ 275.0-7 | 3 | An investment adviser did not have total asset... | ['(a)(2)'] | 0.842859 | 0.95 | 0.90 | 0.9 | 1.00 | [The transformed sentence accurately reflects ... | Fact_Types |
| 469 | \n§ 275.0-7 | 10 | Total assets means the total assets as shown o... | ['(b)(2)'] | 0.976953 | 0.95 | 0.90 | 0.9 | 0.95 | [The transformed sentence closely follows the ... | Fact_Types |
61 rows × 11 columns
# Signed gap between the two metrics (positive => similarity_score higher).
df_similarity['score_difference'] = (
    df_similarity['similarity_score'] - df_similarity['semscore']
)
df_similarity
| doc_id | statement_id | statement | sources | semscore | similarity_score | similarity_score_confidence | transformation_accuracy | grammar_syntax_accuracy | findings | element_type | score_difference | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | § 275.0-2 | 3 | The Secretary of the Commission (Secretary) wi... | [(a)(2)] | 0.952598 | 0.90 | 0.85 | 0.8 | 0.95 | [The transformed sentence maintains the core m... | Operative_Rules | -0.052598 |
| 1 | § 275.0-2 | 4 | If the Secretary certifies that the Commission... | [(a)(3)] | 0.944126 | 0.95 | 0.90 | 0.9 | 0.95 | [The transformed sentence maintains the origin... | Operative_Rules | 0.005874 |
| 2 | § 275.0-5 | 1 | Notice of the initiation of the proceeding wil... | [(a)] | 0.946623 | 0.95 | 0.90 | 0.9 | 0.95 | [The transformed sentence maintains the origin... | Operative_Rules | 0.003377 |
| 3 | § 275.0-5 | 2 | Any interested person may, within the period o... | [(a)] | 0.914574 | 0.85 | 0.90 | 0.8 | 0.95 | [The transformed sentence captures the essence... | Operative_Rules | -0.064574 |
| 4 | § 275.0-5 | 3 | An order disposing of the matter will be issue... | [(b)] | 0.958645 | 0.90 | 0.85 | 0.8 | 0.90 | [The transformed sentence maintains the core m... | Operative_Rules | -0.058645 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 545 | § 275.0-7 | 6 | A person is presumed to control a corporation ... | [(b)(1)(i)(A)] | 0.785793 | 0.90 | 0.85 | 0.8 | 0.95 | [The transformed sentence maintains the core m... | Fact_Types | 0.114207 |
| 546 | § 275.0-7 | 7 | A person is presumed to control a partnership ... | [(b)(1)(ii)] | 0.938356 | 0.95 | 0.90 | 0.9 | 0.95 | [The transformed sentence maintains the origin... | Fact_Types | 0.011644 |
| 547 | § 275.0-7 | 8 | A person is presumed to control a limited liab... | [(b)(1)(iii)] | 0.953968 | 0.95 | 0.90 | 0.9 | 0.95 | [The transformed sentence maintains the origin... | Fact_Types | -0.003968 |
| 548 | § 275.0-7 | 9 | A person is presumed to control a trust if the... | [(b)(1)(iv)] | 0.917051 | 0.80 | 0.80 | 0.6 | 0.70 | [The transformed sentence incorrectly suggests... | Fact_Types | -0.117051 |
| 549 | § 275.0-7 | 10 | Total assets means the total assets as shown o... | [(b)(2)] | 0.976969 | 0.95 | 0.90 | 0.9 | 0.95 | [The transformed sentence closely follows the ... | Fact_Types | -0.026969 |
550 rows × 12 columns
# Plot semscore and similarity score across records on the same axes.
# (The score difference is plotted in the next cell, not here — the original
# comment and title were misleading.)
plt.figure(figsize=(12, 6))
plt.plot(df_similarity.index, df_similarity['semscore'], color='#D55E00', marker='x', linestyle='--', label='Semscore')
plt.plot(df_similarity.index, df_similarity['similarity_score'], color='#0072B2', marker='o', linestyle='-', label='Similarity Score')
plt.title('Semscore and Similarity Score Across Records')  # fixed stray comma
plt.xlabel('Record Index')
plt.ylabel('Scores')
plt.grid(True)
plt.legend()
plt.show()
# Line chart of the per-record gap between the two metrics.
plt.figure(figsize=(10, 6))
plt.plot(
    df_similarity.index,
    df_similarity['score_difference'],
    marker='o',
    linestyle='-',
    label='Score Difference',
)
plt.title('Score Difference Across Records')
plt.xlabel('Record Index')
plt.ylabel('Score Difference')
plt.grid(True)
plt.legend()
plt.show()
# Create an interactive scatter plot
fig = go.Figure()
# Marker shape per element type, so types stay distinguishable in print.
marker_map = {
    'Operative_Rules': 'circle',
    'Names': 'x',
    'Terms': 'triangle-up',
    'Fact_Types': 'diamond'
}
# One trace per element_type; the dropdown defined below toggles them.
unique_types = df_similarity['element_type'].unique()
for element_kind in unique_types:
    subset = df_similarity[df_similarity['element_type'] == element_kind]
    trace = go.Scatter(
        x=subset.index,
        y=subset['score_difference'],
        mode='lines+markers',
        marker={'symbol': marker_map[element_kind]},
        name=element_kind,
        visible=True,  # everything visible until a dropdown choice is made
    )
    fig.add_trace(trace)
# Dropdown menu: "All" plus one entry per element_type.
def _trace_mask(active_index=None):
    # Visibility mask for the traces: all True for "All", otherwise only the
    # trace at active_index stays visible.
    return [active_index is None or position == active_index
            for position in range(len(unique_types))]

dropdown_buttons = [
    dict(label="All",
         method="update",
         args=[{"visible": _trace_mask()},
               {"title": "Score Difference - All Element Types"}]),
]
for i, etype in enumerate(unique_types):
    button = dict(
        label=etype,
        method="update",
        args=[{"visible": _trace_mask(i)},
              {"title": f"Score Difference - {etype}"}],
    )
    dropdown_buttons.append(button)
# Attach the dropdown menu and axis labels, then render the figure.
menu = dict(
    buttons=dropdown_buttons,
    direction="down",
    showactive=True,
    x=0.1,
    y=1.15,
)
fig.update_layout(
    updatemenus=[menu],
    title="Score Difference Across Element Types",
    xaxis_title="Record Index",
    yaxis_title="Score Difference",
    showlegend=True,
)
fig.show()
df_agree['score_difference'] = df_agree['similarity_score'] - df_agree['semscore']
# Partition the records into three disjoint cases:
#   Agree      : |difference| <= 0.01
#   Similarity : similarity_score clearly higher (difference >  0.01)
#   Semscore   : semscore clearly higher        (difference < -0.01)
agree = ((df_agree['score_difference'] >= -0.01) & (df_agree['score_difference'] <= 0.01)).sum()
similarity = (1 - df_agree.loc[df_agree['score_difference'] > 0.01, 'score_difference']).sum()
# BUG FIX: the original used `< 0.01`, which overlapped the "Agree" band and
# double-counted those rows; `< -0.01` makes the three categories exclusive.
semscore = (1 - df_agree.loc[df_agree['score_difference'] < -0.01, 'score_difference']).sum()
# Create a new DataFrame with the calculated values
summary_df = pd.DataFrame({
    'Metric': ['Agree', 'Similarity', 'Semscore'],
    'Value': [agree, similarity, semscore]
})
# Plot the summary as a bar chart
plt.figure(figsize=(8, 6))
plt.bar(summary_df['Metric'], summary_df['Value'], color=['blue', 'green', 'red'])
plt.title('Histogram of Metrics')
plt.xlabel('Metrics')
plt.ylabel('Values')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()
# Display the calculated values
summary_df
| Metric | Value | |
|---|---|---|
| 0 | Agree | 32.000000 |
| 1 | Similarity | 361.177934 |
| 2 | Semscore | 148.063580 |
# Count the occurrences for each metric (mutually exclusive categories).
agree_count = ((df_agree['score_difference'] >= -0.01) & (df_agree['score_difference'] <= 0.01)).sum()
similarity_count = (df_agree['score_difference'] > 0.01).sum()
# BUG FIX: `< 0.01` also counted the "Agree" band (categories summed to more
# than the row count); `< -0.01` is the symmetric, disjoint bound.
semscore_count = (df_agree['score_difference'] < -0.01).sum()
# Create a new DataFrame with the counts
count_summary_df = pd.DataFrame({
    'Metric': ['Agree', 'Similarity', 'Semscore'],
    'Count': [agree_count, similarity_count, semscore_count]
})
# Plot the counts as a bar chart
plt.figure(figsize=(8, 6))
plt.bar(count_summary_df['Metric'], count_summary_df['Count'], color=['blue', 'green', 'red'])
plt.title('Histogram of Metric Counts')
plt.xlabel('Metrics')
plt.ylabel('Counts')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()
# Display the calculated counts
count_summary_df
| Metric | Count | |
|---|---|---|
| 0 | Agree | 32 |
| 1 | Similarity | 410 |
| 2 | Semscore | 140 |
# Agreement is defined as the two scores lying within a 10% band of each other.
margin = 0.1
within_band = (
    (df_agree['similarity_score'] >= df_agree['semscore'] - margin)
    & (df_agree['similarity_score'] <= df_agree['semscore'] + margin)
)
agreement_margin = within_band.sum()
# Everything outside the band counts as disagreement.
disagreement_margin = len(df_agree) - agreement_margin
# Summarize the split for display.
agreement_disagreement_summary = pd.DataFrame({
    'Metric': ['Agreement', 'Disagreement'],
    'Count': [agreement_margin, disagreement_margin]
})
agreement_disagreement_summary
| Metric | Count | |
|---|---|---|
| 0 | Agreement | 306 |
| 1 | Disagreement | 244 |
# Proportional agreement: 1 when the scores match exactly, 0 at the edge of
# the ±margin band, and negative beyond it (deliberately left unclipped).
absolute_gap = (df_agree['similarity_score'] - df_agree['semscore']).abs()
df_agree['agreement_proportion'] = 1 - absolute_gap / margin
# Plot the proportional agreement series
plt.figure(figsize=(12, 6))
plt.plot(
    df_agree.index,
    df_agree['agreement_proportion'],
    marker='o',
    linestyle='-',
    label='Proportional Agreement',
)
plt.title('Proportional Agreement Series')
plt.xlabel('Record Index')
plt.ylabel('Agreement Proportion')
plt.grid(True)
plt.legend()
plt.show()
Records where both metrics agree that the transformation quality is low
# Select records whose combined metric score falls below 70% of the maximum
# possible combined value (each metric tops out at 1.0, so 2 * 0.7 = 1.4).
low_threshold = 2 * 0.7  # 70% of the metrics combined

combined_score = df_agree['similarity_score'] + df_agree['semscore']
low_agreement_df = df_agree[combined_score < low_threshold]

# Display the filtered dataframe
low_agreement_df
| doc_id | statement_id | statement | sources | semscore | similarity_score | similarity_score_confidence | transformation_accuracy | grammar_syntax_accuracy | findings | element_type | score_difference | agreement_proportion | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 296 | § 275.0-5 | Order of the Commission | An order issued by the Commission under the Act. | [(d)] | 0.810312 | 0.5 | 0.7 | 0.3 | 0.2 | [The transformed sentence 'An order is by defi... | Terms | -0.310312 | -2.103124 |
# Show every record ordered from worst to best proportional agreement
# (ascending sort is the pandas default).
df_agree.sort_values(by='agreement_proportion')
| doc_id | statement_id | statement | sources | semscore | similarity_score | similarity_score_confidence | transformation_accuracy | grammar_syntax_accuracy | findings | element_type | score_difference | agreement_proportion | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 296 | § 275.0-5 | Order of the Commission | An order issued by the Commission under the Act. | [(d)] | 0.810312 | 0.50 | 0.70 | 0.30 | 0.20 | [The transformed sentence 'An order is by defi... | Terms | -0.310312 | -2.103124 |
| 112 | § 275.0-2 | Managing agent | Any person, including a trustee, who directs o... | [(b)(1)] | 0.654959 | 0.95 | 0.90 | 0.95 | 1.00 | [The transformed sentence accurately reflects ... | Terms | 0.295041 | -1.950409 |
| 224 | § 275.0-2 | Managing agent | Any person, including a trustee, who directs o... | [(b)(1)] | 0.655024 | 0.95 | 0.90 | 0.95 | 1.00 | [The transformed sentence accurately reflects ... | Terms | 0.294976 | -1.949760 |
| 168 | § 275.0-2 | Managing agent | Any person, including a trustee, who directs o... | [(b)(1)] | 0.655041 | 0.95 | 0.90 | 0.90 | 1.00 | [The transformed sentence accurately reflects ... | Terms | 0.294959 | -1.949594 |
| 308 | § 275.0-2 | Managing agent | Any person, including a trustee, who directs o... | [(b)(1)] | 0.655048 | 0.95 | 0.90 | 0.90 | 1.00 | [The transformed sentence accurately reflects ... | Terms | 0.294952 | -1.949520 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 330 | § 275.0-7 | Right to vote | A person is presumed to control a corporation ... | [(b)(1)(iii), (b)(1)(i)(A)] | 0.950956 | 0.95 | 0.95 | 0.90 | 1.00 | [The transformed sentence closely mirrors the ... | Terms | -0.000956 | 0.990445 |
| 274 | § 275.0-7 | Right to vote | A person is presumed to control a corporation ... | [(b)(1)(iii), (b)(1)(i)(A)] | 0.950775 | 0.95 | 0.95 | 0.90 | 1.00 | [The transformed sentence closely mirrors the ... | Terms | -0.000775 | 0.992246 |
| 302 | § 275.0-7 | Right to vote | A person is presumed to control a corporation ... | [(b)(1)(iii), (b)(1)(i)(A)] | 0.950740 | 0.95 | 0.95 | 0.90 | 0.95 | [The transformed sentence closely mirrors the ... | Terms | -0.000740 | 0.992600 |
| 246 | § 275.0-7 | Right to vote | A person is presumed to control a corporation ... | [(b)(1)(iii), (b)(1)(i)(A)] | 0.950586 | 0.95 | 0.95 | 0.90 | 1.00 | [The transformed sentence maintains the origin... | Terms | -0.000586 | 0.994141 |
| 218 | § 275.0-7 | Right to vote | A person is presumed to control a corporation ... | [(b)(1)(iii), (b)(1)(i)(A)] | 0.950586 | 0.95 | 0.95 | 0.90 | 1.00 | [The transformed sentence maintains the origin... | Terms | -0.000586 | 0.994141 |
550 rows × 13 columns
Correlation analysis using Spearman, Kendall, and Pearson
Kendall
# Kendall's Tau: rank-based measure of monotonic association between the
# LLM-judge similarity score and the embedding-based semscore.
kendall_correlation, p_value_kendall = kendalltau(
    df_agree['similarity_score'],
    df_agree['semscore'],
)
kendall_correlation, p_value_kendall
(-0.04888900614249123, 0.14007660419068518)
Spearman
# Spearman's rank correlation: checks whether the relationship between
# similarity_score and semscore is monotonic (rank-based, not linear).
spearman_correlation, p_value = spearmanr(
    df_agree['similarity_score'],
    df_agree['semscore'],
)
spearman_correlation, p_value
(-0.0678800931290666, 0.11180281052440856)
Pearson
# Pearson (linear) correlation between the two validation metrics.
# Correlation is symmetric, so the operand order is irrelevant.
correlation = df_agree['semscore'].corr(df_agree['similarity_score'])
correlation
-0.10703687388311389
A correlation of -0.107 indicates a negligible negative linear relationship between the variables: changes in one variable do not reliably predict changes in the other. Note that a low linear correlation does not rule out a non-linear relationship, which would require alternative methods of analysis (e.g. rank-based or mutual-information measures) to detect.
Prompt analysis¶
Analyze number of tokens from prompts and documents from last checkpoint using gpt-4o as a reference model.
According to OpenAI | models, the maximum number of tokens (context length) for gpt-4o is 128k.
The cost to use gpt-4o is 2.50 USD per 1m tokens in 2024-10-31. Source: OpenAI | pricing.
Extract elapsed times and completions from all sessions.
managers, file_info_list = get_all_checkpoints(config["DEFAULT_CHECKPOINT_DIR"])
2024-12-13 20:58:03 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-1.json 2024-12-13 20:58:03 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-10.json 2024-12-13 20:58:03 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-2.json
2024-12-13 20:58:03 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-3.json 2024-12-13 20:58:03 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-4.json 2024-12-13 20:58:03 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-5.json 2024-12-13 20:58:03 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-6.json 2024-12-13 20:58:03 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-7.json 2024-12-13 20:58:03 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-8.json 2024-12-13 20:58:03 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-9.json
# Collect per-execution metrics (elapsed times and raw LLM completions) for
# every LLM-produced document stored across all checkpoints.
# IMPROVEMENT: also record the source checkpoint filename per entry (new,
# backward-compatible "filename" key) so downstream analysis can attribute
# each execution to the checkpoint file it came from instead of relying on
# a leftover loop variable.
tokens_eval = {"doc_type": [], "elapsed_times": [], "completions": [], "filename": []}

for manager, file_info in zip(managers, file_info_list):
    # Document keys are (doc_id, doc_type); only LLM outputs are of interest.
    for key in manager.model_dump()["documents"].keys():
        if key[1].startswith("llm_"):
            doc = manager.retrieve_document(key[0], key[1])
            logger.info(f"Processing: {key[0]}, {key[1]}")
            elapsed_times = doc.elapsed_times
            logger.debug(f"Elapsed time: {elapsed_times}")
            completions = doc.completions
            logger.debug(f"Completions: {completions}")
            tokens_eval["doc_type"].append(key[1])
            tokens_eval["elapsed_times"].append(elapsed_times)
            tokens_eval["completions"].append(completions)
            tokens_eval["filename"].append(file_info["filename"])

logger.info(f"Executions for evaluation: {len(tokens_eval['doc_type'])}")
2024-12-13 20:58:03 - INFO - Processing: § 275.0-2_P1, llm_response 2024-12-13 20:58:03 - INFO - Processing: § 275.0-2_P2, llm_response 2024-12-13 20:58:03 - INFO - Processing: § 275.0-5_P1, llm_response 2024-12-13 20:58:03 - INFO - Processing: § 275.0-5_P2, llm_response 2024-12-13 20:58:03 - INFO - Processing: § 275.0-7_P1, llm_response 2024-12-13 20:58:03 - INFO - Processing: § 275.0-7_P2, llm_response 2024-12-13 20:58:03 - INFO - Processing: classify_P1, llm_response_classification 2024-12-13 20:58:03 - INFO - Processing: classify_P2_Operative_rules, llm_response_classification 2024-12-13 20:58:03 - INFO - Processing: classify_P2_Definitional_terms, llm_response_classification 2024-12-13 20:58:03 - INFO - Processing: classify_P2_Definitional_names, llm_response_classification 2024-12-13 20:58:03 - INFO - Processing: classify_P2_Definitional_facts, llm_response_classification 2024-12-13 20:58:03 - INFO - Processing: validation_judge_Fact_Types, llm_validation 2024-12-13 20:58:03 - INFO - Processing: validation_judge_Terms, llm_validation 2024-12-13 20:58:03 - INFO - Processing: validation_judge_Names, llm_validation 2024-12-13 20:58:03 - INFO - Processing: validation_judge_Operative_Rules, llm_validation 2024-12-13 20:58:03 - INFO - Processing: transform_Operative_Rules, llm_response_transform 2024-12-13 20:58:03 - INFO - Processing: transform_Fact_Types, llm_response_transform 2024-12-13 20:58:03 - INFO - Processing: transform_Terms, llm_response_transform 2024-12-13 20:58:03 - INFO - Processing: transform_Names, llm_response_transform 2024-12-13 20:58:03 - INFO - Processing: § 275.0-2_P1, llm_response 2024-12-13 20:58:03 - INFO - Processing: § 275.0-2_P2, llm_response 2024-12-13 20:58:03 - INFO - Processing: § 275.0-5_P1, llm_response 2024-12-13 20:58:03 - INFO - Processing: § 275.0-5_P2, llm_response 2024-12-13 20:58:03 - INFO - Processing: § 275.0-7_P1, llm_response 2024-12-13 20:58:03 - INFO - Processing: § 275.0-7_P2, llm_response 2024-12-13 20:58:03 - INFO 
- Processing: classify_P1, llm_response_classification 2024-12-13 20:58:03 - INFO - Processing: classify_P2_Operative_rules, llm_response_classification 2024-12-13 20:58:03 - INFO - Processing: classify_P2_Definitional_terms, llm_response_classification 2024-12-13 20:58:03 - INFO - Processing: classify_P2_Definitional_names, llm_response_classification 2024-12-13 20:58:03 - INFO - Processing: classify_P2_Definitional_facts, llm_response_classification 2024-12-13 20:58:03 - INFO - Processing: transform_Operative_Rules, llm_response_transform 2024-12-13 20:58:03 - INFO - Processing: transform_Fact_Types, llm_response_transform 2024-12-13 20:58:03 - INFO - Processing: transform_Terms, llm_response_transform 2024-12-13 20:58:03 - INFO - Processing: transform_Names, llm_response_transform 2024-12-13 20:58:03 - INFO - Processing: validation_judge_Fact_Types, llm_validation 2024-12-13 20:58:03 - INFO - Processing: validation_judge_Terms, llm_validation 2024-12-13 20:58:03 - INFO - Processing: validation_judge_Names, llm_validation 2024-12-13 20:58:03 - INFO - Processing: validation_judge_Operative_Rules, llm_validation 2024-12-13 20:58:03 - INFO - Processing: § 275.0-2_P1, llm_response 2024-12-13 20:58:03 - INFO - Processing: § 275.0-2_P2, llm_response 2024-12-13 20:58:03 - INFO - Processing: § 275.0-5_P1, llm_response 2024-12-13 20:58:03 - INFO - Processing: § 275.0-5_P2, llm_response 2024-12-13 20:58:03 - INFO - Processing: § 275.0-7_P1, llm_response 2024-12-13 20:58:03 - INFO - Processing: § 275.0-7_P2, llm_response 2024-12-13 20:58:03 - INFO - Processing: classify_P1, llm_response_classification 2024-12-13 20:58:03 - INFO - Processing: classify_P2_Operative_rules, llm_response_classification 2024-12-13 20:58:03 - INFO - Processing: classify_P2_Definitional_terms, llm_response_classification 2024-12-13 20:58:03 - INFO - Processing: classify_P2_Definitional_names, llm_response_classification 2024-12-13 20:58:03 - INFO - Processing: classify_P2_Definitional_facts, 
llm_response_classification 2024-12-13 20:58:03 - INFO - Processing: transform_Operative_Rules, llm_response_transform 2024-12-13 20:58:03 - INFO - Processing: transform_Fact_Types, llm_response_transform 2024-12-13 20:58:03 - INFO - Processing: transform_Terms, llm_response_transform 2024-12-13 20:58:03 - INFO - Processing: transform_Names, llm_response_transform 2024-12-13 20:58:03 - INFO - Processing: validation_judge_Fact_Types, llm_validation 2024-12-13 20:58:03 - INFO - Processing: validation_judge_Terms, llm_validation 2024-12-13 20:58:03 - INFO - Processing: validation_judge_Names, llm_validation 2024-12-13 20:58:03 - INFO - Processing: validation_judge_Operative_Rules, llm_validation 2024-12-13 20:58:03 - INFO - Processing: § 275.0-2_P1, llm_response 2024-12-13 20:58:03 - INFO - Processing: § 275.0-2_P2, llm_response 2024-12-13 20:58:03 - INFO - Processing: § 275.0-5_P1, llm_response 2024-12-13 20:58:03 - INFO - Processing: § 275.0-5_P2, llm_response 2024-12-13 20:58:03 - INFO - Processing: § 275.0-7_P1, llm_response 2024-12-13 20:58:03 - INFO - Processing: § 275.0-7_P2, llm_response 2024-12-13 20:58:03 - INFO - Processing: classify_P1, llm_response_classification 2024-12-13 20:58:03 - INFO - Processing: classify_P2_Operative_rules, llm_response_classification 2024-12-13 20:58:03 - INFO - Processing: classify_P2_Definitional_terms, llm_response_classification 2024-12-13 20:58:03 - INFO - Processing: classify_P2_Definitional_names, llm_response_classification 2024-12-13 20:58:03 - INFO - Processing: classify_P2_Definitional_facts, llm_response_classification 2024-12-13 20:58:03 - INFO - Processing: transform_Operative_Rules, llm_response_transform 2024-12-13 20:58:03 - INFO - Processing: transform_Fact_Types, llm_response_transform 2024-12-13 20:58:03 - INFO - Processing: transform_Terms, llm_response_transform 2024-12-13 20:58:03 - INFO - Processing: transform_Names, llm_response_transform 2024-12-13 20:58:03 - INFO - Processing: 
validation_judge_Fact_Types, llm_validation 2024-12-13 20:58:03 - INFO - Processing: validation_judge_Terms, llm_validation 2024-12-13 20:58:03 - INFO - Processing: validation_judge_Names, llm_validation 2024-12-13 20:58:03 - INFO - Processing: validation_judge_Operative_Rules, llm_validation 2024-12-13 20:58:03 - INFO - Processing: § 275.0-2_P1, llm_response 2024-12-13 20:58:03 - INFO - Processing: § 275.0-2_P2, llm_response 2024-12-13 20:58:03 - INFO - Processing: § 275.0-5_P1, llm_response 2024-12-13 20:58:03 - INFO - Processing: § 275.0-5_P2, llm_response 2024-12-13 20:58:03 - INFO - Processing: § 275.0-7_P1, llm_response 2024-12-13 20:58:03 - INFO - Processing: § 275.0-7_P2, llm_response 2024-12-13 20:58:03 - INFO - Processing: classify_P1, llm_response_classification 2024-12-13 20:58:03 - INFO - Processing: classify_P2_Operative_rules, llm_response_classification 2024-12-13 20:58:03 - INFO - Processing: classify_P2_Definitional_terms, llm_response_classification 2024-12-13 20:58:03 - INFO - Processing: classify_P2_Definitional_names, llm_response_classification 2024-12-13 20:58:03 - INFO - Processing: classify_P2_Definitional_facts, llm_response_classification 2024-12-13 20:58:03 - INFO - Processing: transform_Operative_Rules, llm_response_transform 2024-12-13 20:58:03 - INFO - Processing: transform_Fact_Types, llm_response_transform 2024-12-13 20:58:03 - INFO - Processing: transform_Terms, llm_response_transform 2024-12-13 20:58:03 - INFO - Processing: transform_Names, llm_response_transform 2024-12-13 20:58:03 - INFO - Processing: validation_judge_Fact_Types, llm_validation 2024-12-13 20:58:03 - INFO - Processing: validation_judge_Terms, llm_validation 2024-12-13 20:58:03 - INFO - Processing: validation_judge_Names, llm_validation 2024-12-13 20:58:03 - INFO - Processing: validation_judge_Operative_Rules, llm_validation 2024-12-13 20:58:03 - INFO - Processing: § 275.0-2_P1, llm_response 2024-12-13 20:58:03 - INFO - Processing: § 275.0-2_P2, llm_response 
2024-12-13 20:58:03 - INFO - Processing: § 275.0-5_P1, llm_response 2024-12-13 20:58:03 - INFO - Processing: § 275.0-5_P2, llm_response 2024-12-13 20:58:03 - INFO - Processing: § 275.0-7_P1, llm_response 2024-12-13 20:58:03 - INFO - Processing: § 275.0-7_P2, llm_response 2024-12-13 20:58:03 - INFO - Processing: classify_P1, llm_response_classification 2024-12-13 20:58:03 - INFO - Processing: classify_P2_Operative_rules, llm_response_classification 2024-12-13 20:58:03 - INFO - Processing: classify_P2_Definitional_terms, llm_response_classification 2024-12-13 20:58:03 - INFO - Processing: classify_P2_Definitional_names, llm_response_classification 2024-12-13 20:58:03 - INFO - Processing: classify_P2_Definitional_facts, llm_response_classification 2024-12-13 20:58:03 - INFO - Processing: transform_Operative_Rules, llm_response_transform 2024-12-13 20:58:03 - INFO - Processing: transform_Fact_Types, llm_response_transform 2024-12-13 20:58:03 - INFO - Processing: transform_Terms, llm_response_transform 2024-12-13 20:58:03 - INFO - Processing: transform_Names, llm_response_transform 2024-12-13 20:58:03 - INFO - Processing: validation_judge_Fact_Types, llm_validation 2024-12-13 20:58:03 - INFO - Processing: validation_judge_Terms, llm_validation 2024-12-13 20:58:03 - INFO - Processing: validation_judge_Names, llm_validation 2024-12-13 20:58:03 - INFO - Processing: validation_judge_Operative_Rules, llm_validation 2024-12-13 20:58:03 - INFO - Processing: § 275.0-2_P1, llm_response 2024-12-13 20:58:03 - INFO - Processing: § 275.0-2_P2, llm_response 2024-12-13 20:58:03 - INFO - Processing: § 275.0-5_P1, llm_response 2024-12-13 20:58:03 - INFO - Processing: § 275.0-5_P2, llm_response 2024-12-13 20:58:03 - INFO - Processing: § 275.0-7_P1, llm_response 2024-12-13 20:58:03 - INFO - Processing: § 275.0-7_P2, llm_response 2024-12-13 20:58:03 - INFO - Processing: classify_P1, llm_response_classification 2024-12-13 20:58:03 - INFO - Processing: classify_P2_Operative_rules, 
llm_response_classification 2024-12-13 20:58:03 - INFO - Processing: classify_P2_Definitional_terms, llm_response_classification 2024-12-13 20:58:03 - INFO - Processing: classify_P2_Definitional_names, llm_response_classification 2024-12-13 20:58:03 - INFO - Processing: classify_P2_Definitional_facts, llm_response_classification 2024-12-13 20:58:03 - INFO - Processing: transform_Operative_Rules, llm_response_transform 2024-12-13 20:58:03 - INFO - Processing: transform_Fact_Types, llm_response_transform 2024-12-13 20:58:03 - INFO - Processing: transform_Terms, llm_response_transform 2024-12-13 20:58:03 - INFO - Processing: transform_Names, llm_response_transform 2024-12-13 20:58:03 - INFO - Processing: validation_judge_Fact_Types, llm_validation 2024-12-13 20:58:03 - INFO - Processing: validation_judge_Terms, llm_validation 2024-12-13 20:58:03 - INFO - Processing: validation_judge_Names, llm_validation 2024-12-13 20:58:03 - INFO - Processing: validation_judge_Operative_Rules, llm_validation 2024-12-13 20:58:03 - INFO - Processing: § 275.0-2_P1, llm_response 2024-12-13 20:58:03 - INFO - Processing: § 275.0-2_P2, llm_response 2024-12-13 20:58:03 - INFO - Processing: § 275.0-5_P1, llm_response 2024-12-13 20:58:03 - INFO - Processing: § 275.0-5_P2, llm_response 2024-12-13 20:58:03 - INFO - Processing: § 275.0-7_P1, llm_response 2024-12-13 20:58:03 - INFO - Processing: § 275.0-7_P2, llm_response 2024-12-13 20:58:03 - INFO - Processing: classify_P1, llm_response_classification 2024-12-13 20:58:03 - INFO - Processing: classify_P2_Operative_rules, llm_response_classification 2024-12-13 20:58:03 - INFO - Processing: classify_P2_Definitional_terms, llm_response_classification 2024-12-13 20:58:03 - INFO - Processing: classify_P2_Definitional_names, llm_response_classification 2024-12-13 20:58:03 - INFO - Processing: classify_P2_Definitional_facts, llm_response_classification 2024-12-13 20:58:03 - INFO - Processing: transform_Operative_Rules, llm_response_transform 2024-12-13 
20:58:03 - INFO - Processing: transform_Fact_Types, llm_response_transform 2024-12-13 20:58:03 - INFO - Processing: transform_Terms, llm_response_transform 2024-12-13 20:58:03 - INFO - Processing: transform_Names, llm_response_transform 2024-12-13 20:58:03 - INFO - Processing: validation_judge_Fact_Types, llm_validation 2024-12-13 20:58:03 - INFO - Processing: validation_judge_Terms, llm_validation 2024-12-13 20:58:03 - INFO - Processing: validation_judge_Names, llm_validation 2024-12-13 20:58:03 - INFO - Processing: validation_judge_Operative_Rules, llm_validation 2024-12-13 20:58:03 - INFO - Processing: § 275.0-2_P1, llm_response 2024-12-13 20:58:03 - INFO - Processing: § 275.0-2_P2, llm_response 2024-12-13 20:58:03 - INFO - Processing: § 275.0-5_P1, llm_response 2024-12-13 20:58:03 - INFO - Processing: § 275.0-5_P2, llm_response 2024-12-13 20:58:03 - INFO - Processing: § 275.0-7_P1, llm_response 2024-12-13 20:58:03 - INFO - Processing: § 275.0-7_P2, llm_response 2024-12-13 20:58:03 - INFO - Processing: classify_P1, llm_response_classification 2024-12-13 20:58:03 - INFO - Processing: classify_P2_Operative_rules, llm_response_classification 2024-12-13 20:58:03 - INFO - Processing: classify_P2_Definitional_terms, llm_response_classification 2024-12-13 20:58:03 - INFO - Processing: classify_P2_Definitional_names, llm_response_classification 2024-12-13 20:58:03 - INFO - Processing: classify_P2_Definitional_facts, llm_response_classification 2024-12-13 20:58:03 - INFO - Processing: transform_Operative_Rules, llm_response_transform 2024-12-13 20:58:03 - INFO - Processing: transform_Fact_Types, llm_response_transform 2024-12-13 20:58:03 - INFO - Processing: transform_Terms, llm_response_transform 2024-12-13 20:58:03 - INFO - Processing: transform_Names, llm_response_transform 2024-12-13 20:58:03 - INFO - Processing: validation_judge_Fact_Types, llm_validation 2024-12-13 20:58:03 - INFO - Processing: validation_judge_Terms, llm_validation 2024-12-13 20:58:03 - INFO - 
Processing: validation_judge_Names, llm_validation 2024-12-13 20:58:03 - INFO - Processing: validation_judge_Operative_Rules, llm_validation 2024-12-13 20:58:03 - INFO - Processing: § 275.0-2_P1, llm_response 2024-12-13 20:58:03 - INFO - Processing: § 275.0-2_P2, llm_response 2024-12-13 20:58:03 - INFO - Processing: § 275.0-5_P1, llm_response 2024-12-13 20:58:03 - INFO - Processing: § 275.0-5_P2, llm_response 2024-12-13 20:58:03 - INFO - Processing: § 275.0-7_P1, llm_response 2024-12-13 20:58:03 - INFO - Processing: § 275.0-7_P2, llm_response 2024-12-13 20:58:03 - INFO - Processing: classify_P1, llm_response_classification 2024-12-13 20:58:03 - INFO - Processing: classify_P2_Operative_rules, llm_response_classification 2024-12-13 20:58:03 - INFO - Processing: classify_P2_Definitional_terms, llm_response_classification 2024-12-13 20:58:03 - INFO - Processing: classify_P2_Definitional_names, llm_response_classification 2024-12-13 20:58:03 - INFO - Processing: classify_P2_Definitional_facts, llm_response_classification 2024-12-13 20:58:03 - INFO - Processing: transform_Operative_Rules, llm_response_transform 2024-12-13 20:58:03 - INFO - Processing: transform_Fact_Types, llm_response_transform 2024-12-13 20:58:03 - INFO - Processing: transform_Terms, llm_response_transform 2024-12-13 20:58:03 - INFO - Processing: transform_Names, llm_response_transform 2024-12-13 20:58:03 - INFO - Processing: validation_judge_Fact_Types, llm_validation 2024-12-13 20:58:03 - INFO - Processing: validation_judge_Terms, llm_validation 2024-12-13 20:58:03 - INFO - Processing: validation_judge_Names, llm_validation 2024-12-13 20:58:03 - INFO - Processing: validation_judge_Operative_Rules, llm_validation 2024-12-13 20:58:03 - INFO - Executions for evaluation: 190
Evaluate
# Constants
reference_models = config["REFERENCE_MODELS"]["MAX_CONTEXT_LENGTH"]
price_per_million_tokens = config["REFERENCE_MODELS"]["PRICE_PER_MILLION_TOKENS"]

# Flatten tokens_eval into (filename, doc_type, elapsed_time, usage, created,
# model) tuples for prompt_analysis.
raw_data = []

# BUG FIX: the original used `file_info["filename"]` here, a stale loop
# variable left over from the checkpoint-collection cell, which tagged EVERY
# row with the LAST checkpoint's filename (all output rows showed
# documents-2024-12-08-9.json). Use the per-entry filename recorded in
# tokens_eval when available; fall back to the old behaviour otherwise so
# this cell still runs against older tokens_eval structures.
filenames = tokens_eval.get("filename")

for idx, (doc_type, elapsed_times, completions) in enumerate(zip(
    tokens_eval["doc_type"], tokens_eval["elapsed_times"], tokens_eval["completions"]
)):
    source_filename = filenames[idx] if filenames else file_info["filename"]
    for elapsed_time, completion in zip(elapsed_times, completions):
        raw_data.append(
            (
                source_filename,
                doc_type,
                elapsed_time,
                completion["usage"],
                completion["created"],
                completion["model"],
            )
        )

prompt_analysis(raw_data, config["DEFAULT_OUTPUT_DIR"])
Overall Statistics:
Total Tokens Number of Samples Average Elapsed Time (s) Estimated Cost (USD) Average Percentage of Context Length (%) Min Created Timestamp Max Created Timestamp origin run_at
5472538 1210 3.819458 13.681345 3.533405 2024-11-30 00:08:20 2024-12-09 02:11:07 documents-2024-12-08-9.json 2024-12-13 20:58:03
Statistics by Sample Type (doc_type):
doc_type total_tokens num_samples average_elapsed_time average_tokens estimated_cost average_percentage_context_length filename run_at
llm_response 272850 60 28.951583 4547.500000 0.682125 3.552734 documents-2024-12-08-9.json 2024-12-13 20:58:03
llm_response_classification 370390 50 8.280727 7407.800000 0.925975 5.787344 documents-2024-12-08-9.json 2024-12-13 20:58:03
llm_response_transform 2480495 550 2.315991 4509.990909 6.201237 3.523430 documents-2024-12-08-9.json 2024-12-13 20:58:03
llm_validation 2348803 550 2.175670 4270.550909 5.872008 3.336368 documents-2024-12-08-9.json 2024-12-13 20:58:03
Statistics by Model:
model total_tokens num_samples average_elapsed_time average_tokens average_percentage_context_length filename run_at estimated_cost cost
gpt-4o-2024-08-06 5472538 1210 3.819458 4522.758678 3.533405 documents-2024-12-08-9.json 2024-12-13 20:58:03 13.681345 13.681345
Additional Statistics:
Average Completion Tokens Average Prompt Tokens Average Total Tokens per Sample Total Elapsed Time (s) Average Tokens per Second origin run_at
314.103306 4208.655372 4522.758678 4621.544686 1959.897985 documents-2024-12-08-9.json 2024-12-13 20:58:03
Discussion¶
TODO